]> git.saurik.com Git - wxWidgets.git/blob - src/png/pngvcrd.c
72c44121316b1cefbbd2f7f3d6e051824d42c69b
[wxWidgets.git] / src / png / pngvcrd.c
1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
5 * libpng version 1.2.4 - July 8, 2002
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2002 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
9 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 *
13 *
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
19 *
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
21 *
22 * [runtime MMX configuration, GRR 20010102]
23 *
24 */
25
26 #define PNG_INTERNAL
27 #include "png.h"
28
29 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
30
31 static int mmx_supported=2; /* cached result of png_mmx_support(): 0 = no MMX, 1 = MMX present, 2 = not probed yet (callers in this file check for 2 and probe on demand) */
32
33
34 int PNGAPI
35 png_mmx_support(void)
36 {
37 int mmx_supported_local = 0;
38 _asm {
39 push ebx //CPUID will trash these
40 push ecx
41 push edx
42
43 pushfd //Save Eflag to stack
44 pop eax //Get Eflag from stack into eax
45 mov ecx, eax //Make another copy of Eflag in ecx
46 xor eax, 0x200000 //Toggle ID bit in Eflag [i.e. bit(21)]
47 push eax //Save modified Eflag back to stack
48
49 popfd //Restored modified value back to Eflag reg
50 pushfd //Save Eflag to stack
51 pop eax //Get Eflag from stack
52 push ecx // save original Eflag to stack
53 popfd // restore original Eflag
54 xor eax, ecx //Compare the new Eflag with the original Eflag
55 jz NOT_SUPPORTED //If the same, CPUID instruction is not supported,
56 //skip following instructions and jump to
57 //NOT_SUPPORTED label
58
59 xor eax, eax //Set eax to zero
60
61 _asm _emit 0x0f //CPUID instruction (two bytes opcode)
62 _asm _emit 0xa2
63
64 cmp eax, 1 //make sure eax return non-zero value
65 jl NOT_SUPPORTED //If eax is zero, mmx not supported
66
67 xor eax, eax //set eax to zero
68 inc eax //Now increment eax to 1. This instruction is
69 //faster than the instruction "mov eax, 1"
70
71 _asm _emit 0x0f //CPUID instruction
72 _asm _emit 0xa2
73
74 and edx, 0x00800000 //mask out all bits but mmx bit(24)
75 cmp edx, 0 // 0 = mmx not supported
76 jz NOT_SUPPORTED // non-zero = Yes, mmx IS supported
77
78 mov mmx_supported_local, 1 //set return value to 1
79
80 NOT_SUPPORTED:
81 mov eax, mmx_supported_local //move return value to eax
82 pop edx //CPUID trashed these
83 pop ecx
84 pop ebx
85 }
86
87 //mmx_supported_local=0; // test code for force don't support MMX
88 //printf("MMX : %u (1=MMX supported)\n",mmx_supported_local);
89
90 mmx_supported = mmx_supported_local;
91 return mmx_supported_local;
92 }
93
94 /* Combines the row recently read in with the previous row.
95 This routine takes care of alpha and transparency if requested.
96 This routine also handles the two methods of progressive display
97 of interlaced images, depending on the mask value.
98 The mask value describes which pixels are to be combined with
99 the row. The pattern always repeats every 8 pixels, so just 8
100 bits are needed. A one indicates the pixel is to be combined; a
101 zero indicates the pixel is to be skipped. This is in addition
102 to any alpha or transparency value associated with the pixel. If
103 you want all pixels to be combined, pass 0xff (255) in mask. */
104
105 /* Use this routine for x86 platform - uses faster MMX routine if machine
106 supports MMX */
107
108 void /* PRIVATE */
109 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
110 {
111 #ifdef PNG_USE_LOCAL_ARRAYS
112 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
113 #endif
114
115 png_debug(1,"in png_combine_row_asm\n");
116
117 if (mmx_supported == 2) {
118 /* this should have happened in png_init_mmx_flags() already */
119 png_warning(png_ptr, "asm_flags may not have been initialized");
120 png_mmx_support();
121 }
122
123 if (mask == 0xff)
124 {
125 png_memcpy(row, png_ptr->row_buf + 1,
126 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
127 }
128 /* GRR: add "else if (mask == 0)" case?
129 * or does png_combine_row() not even get called in that case? */
130 else
131 {
132 switch (png_ptr->row_info.pixel_depth)
133 {
134 case 1:
135 {
136 png_bytep sp;
137 png_bytep dp;
138 int s_inc, s_start, s_end;
139 int m;
140 int shift;
141 png_uint_32 i;
142
143 sp = png_ptr->row_buf + 1;
144 dp = row;
145 m = 0x80;
146 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
147 if (png_ptr->transformations & PNG_PACKSWAP)
148 {
149 s_start = 0;
150 s_end = 7;
151 s_inc = 1;
152 }
153 else
154 #endif
155 {
156 s_start = 7;
157 s_end = 0;
158 s_inc = -1;
159 }
160
161 shift = s_start;
162
163 for (i = 0; i < png_ptr->width; i++)
164 {
165 if (m & mask)
166 {
167 int value;
168
169 value = (*sp >> shift) & 0x1;
170 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
171 *dp |= (png_byte)(value << shift);
172 }
173
174 if (shift == s_end)
175 {
176 shift = s_start;
177 sp++;
178 dp++;
179 }
180 else
181 shift += s_inc;
182
183 if (m == 1)
184 m = 0x80;
185 else
186 m >>= 1;
187 }
188 break;
189 }
190
191 case 2:
192 {
193 png_bytep sp;
194 png_bytep dp;
195 int s_start, s_end, s_inc;
196 int m;
197 int shift;
198 png_uint_32 i;
199 int value;
200
201 sp = png_ptr->row_buf + 1;
202 dp = row;
203 m = 0x80;
204 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
205 if (png_ptr->transformations & PNG_PACKSWAP)
206 {
207 s_start = 0;
208 s_end = 6;
209 s_inc = 2;
210 }
211 else
212 #endif
213 {
214 s_start = 6;
215 s_end = 0;
216 s_inc = -2;
217 }
218
219 shift = s_start;
220
221 for (i = 0; i < png_ptr->width; i++)
222 {
223 if (m & mask)
224 {
225 value = (*sp >> shift) & 0x3;
226 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
227 *dp |= (png_byte)(value << shift);
228 }
229
230 if (shift == s_end)
231 {
232 shift = s_start;
233 sp++;
234 dp++;
235 }
236 else
237 shift += s_inc;
238 if (m == 1)
239 m = 0x80;
240 else
241 m >>= 1;
242 }
243 break;
244 }
245
246 case 4:
247 {
248 png_bytep sp;
249 png_bytep dp;
250 int s_start, s_end, s_inc;
251 int m;
252 int shift;
253 png_uint_32 i;
254 int value;
255
256 sp = png_ptr->row_buf + 1;
257 dp = row;
258 m = 0x80;
259 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
260 if (png_ptr->transformations & PNG_PACKSWAP)
261 {
262 s_start = 0;
263 s_end = 4;
264 s_inc = 4;
265 }
266 else
267 #endif
268 {
269 s_start = 4;
270 s_end = 0;
271 s_inc = -4;
272 }
273 shift = s_start;
274
275 for (i = 0; i < png_ptr->width; i++)
276 {
277 if (m & mask)
278 {
279 value = (*sp >> shift) & 0xf;
280 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
281 *dp |= (png_byte)(value << shift);
282 }
283
284 if (shift == s_end)
285 {
286 shift = s_start;
287 sp++;
288 dp++;
289 }
290 else
291 shift += s_inc;
292 if (m == 1)
293 m = 0x80;
294 else
295 m >>= 1;
296 }
297 break;
298 }
299
300 case 8:
301 {
302 png_bytep srcptr;
303 png_bytep dstptr;
304 png_uint_32 len;
305 int m;
306 int diff, unmask;
307
308 __int64 mask0=0x0102040810204080;
309
310 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
311 /* && mmx_supported */ )
312 {
313 srcptr = png_ptr->row_buf + 1;
314 dstptr = row;
315 m = 0x80;
316 unmask = ~mask;
317 len = png_ptr->width &~7; //reduce to multiple of 8
318 diff = png_ptr->width & 7; //amount lost
319
320 _asm
321 {
322 movd mm7, unmask //load bit pattern
323 psubb mm6,mm6 //zero mm6
324 punpcklbw mm7,mm7
325 punpcklwd mm7,mm7
326 punpckldq mm7,mm7 //fill register with 8 masks
327
328 movq mm0,mask0
329
330 pand mm0,mm7 //nonzero if keep byte
331 pcmpeqb mm0,mm6 //zeros->1s, v versa
332
333 mov ecx,len //load length of line (pixels)
334 mov esi,srcptr //load source
335 mov ebx,dstptr //load dest
336 cmp ecx,0 //lcr
337 je mainloop8end
338
339 mainloop8:
340 movq mm4,[esi]
341 pand mm4,mm0
342 movq mm6,mm0
343 pandn mm6,[ebx]
344 por mm4,mm6
345 movq [ebx],mm4
346
347 add esi,8 //inc by 8 bytes processed
348 add ebx,8
349 sub ecx,8 //dec by 8 pixels processed
350
351 ja mainloop8
352 mainloop8end:
353
354 mov ecx,diff
355 cmp ecx,0
356 jz end8
357
358 mov edx,mask
359 sal edx,24 //make low byte the high byte
360
361 secondloop8:
362 sal edx,1 //move high bit to CF
363 jnc skip8 //if CF = 0
364 mov al,[esi]
365 mov [ebx],al
366 skip8:
367 inc esi
368 inc ebx
369
370 dec ecx
371 jnz secondloop8
372 end8:
373 emms
374 }
375 }
376 else /* mmx not supported - use modified C routine */
377 {
378 register unsigned int incr1, initial_val, final_val;
379 png_size_t pixel_bytes;
380 png_uint_32 i;
381 register int disp = png_pass_inc[png_ptr->pass];
382 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
383
384 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
385 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
386 pixel_bytes;
387 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
388 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
389 final_val = png_ptr->width*pixel_bytes;
390 incr1 = (disp)*pixel_bytes;
391 for (i = initial_val; i < final_val; i += incr1)
392 {
393 png_memcpy(dstptr, srcptr, pixel_bytes);
394 srcptr += incr1;
395 dstptr += incr1;
396 }
397 } /* end of else */
398
399 break;
400 } // end 8 bpp
401
402 case 16:
403 {
404 png_bytep srcptr;
405 png_bytep dstptr;
406 png_uint_32 len;
407 int unmask, diff;
408 __int64 mask1=0x0101020204040808,
409 mask0=0x1010202040408080;
410
411 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
412 /* && mmx_supported */ )
413 {
414 srcptr = png_ptr->row_buf + 1;
415 dstptr = row;
416
417 unmask = ~mask;
418 len = (png_ptr->width)&~7;
419 diff = (png_ptr->width)&7;
420 _asm
421 {
422 movd mm7, unmask //load bit pattern
423 psubb mm6,mm6 //zero mm6
424 punpcklbw mm7,mm7
425 punpcklwd mm7,mm7
426 punpckldq mm7,mm7 //fill register with 8 masks
427
428 movq mm0,mask0
429 movq mm1,mask1
430
431 pand mm0,mm7
432 pand mm1,mm7
433
434 pcmpeqb mm0,mm6
435 pcmpeqb mm1,mm6
436
437 mov ecx,len //load length of line
438 mov esi,srcptr //load source
439 mov ebx,dstptr //load dest
440 cmp ecx,0 //lcr
441 jz mainloop16end
442
443 mainloop16:
444 movq mm4,[esi]
445 pand mm4,mm0
446 movq mm6,mm0
447 movq mm7,[ebx]
448 pandn mm6,mm7
449 por mm4,mm6
450 movq [ebx],mm4
451
452 movq mm5,[esi+8]
453 pand mm5,mm1
454 movq mm7,mm1
455 movq mm6,[ebx+8]
456 pandn mm7,mm6
457 por mm5,mm7
458 movq [ebx+8],mm5
459
460 add esi,16 //inc by 16 bytes processed
461 add ebx,16
462 sub ecx,8 //dec by 8 pixels processed
463
464 ja mainloop16
465
466 mainloop16end:
467 mov ecx,diff
468 cmp ecx,0
469 jz end16
470
471 mov edx,mask
472 sal edx,24 //make low byte the high byte
473 secondloop16:
474 sal edx,1 //move high bit to CF
475 jnc skip16 //if CF = 0
476 mov ax,[esi]
477 mov [ebx],ax
478 skip16:
479 add esi,2
480 add ebx,2
481
482 dec ecx
483 jnz secondloop16
484 end16:
485 emms
486 }
487 }
488 else /* mmx not supported - use modified C routine */
489 {
490 register unsigned int incr1, initial_val, final_val;
491 png_size_t pixel_bytes;
492 png_uint_32 i;
493 register int disp = png_pass_inc[png_ptr->pass];
494 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
495
496 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
497 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
498 pixel_bytes;
499 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
500 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
501 final_val = png_ptr->width*pixel_bytes;
502 incr1 = (disp)*pixel_bytes;
503 for (i = initial_val; i < final_val; i += incr1)
504 {
505 png_memcpy(dstptr, srcptr, pixel_bytes);
506 srcptr += incr1;
507 dstptr += incr1;
508 }
509 } /* end of else */
510
511 break;
512 } // end 16 bpp
513
514 case 24:
515 {
516 png_bytep srcptr;
517 png_bytep dstptr;
518 png_uint_32 len;
519 int unmask, diff;
520
521 __int64 mask2=0x0101010202020404, //24bpp
522 mask1=0x0408080810101020,
523 mask0=0x2020404040808080;
524
525 srcptr = png_ptr->row_buf + 1;
526 dstptr = row;
527
528 unmask = ~mask;
529 len = (png_ptr->width)&~7;
530 diff = (png_ptr->width)&7;
531
532 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
533 /* && mmx_supported */ )
534 {
535 _asm
536 {
537 movd mm7, unmask //load bit pattern
538 psubb mm6,mm6 //zero mm6
539 punpcklbw mm7,mm7
540 punpcklwd mm7,mm7
541 punpckldq mm7,mm7 //fill register with 8 masks
542
543 movq mm0,mask0
544 movq mm1,mask1
545 movq mm2,mask2
546
547 pand mm0,mm7
548 pand mm1,mm7
549 pand mm2,mm7
550
551 pcmpeqb mm0,mm6
552 pcmpeqb mm1,mm6
553 pcmpeqb mm2,mm6
554
555 mov ecx,len //load length of line
556 mov esi,srcptr //load source
557 mov ebx,dstptr //load dest
558 cmp ecx,0
559 jz mainloop24end
560
561 mainloop24:
562 movq mm4,[esi]
563 pand mm4,mm0
564 movq mm6,mm0
565 movq mm7,[ebx]
566 pandn mm6,mm7
567 por mm4,mm6
568 movq [ebx],mm4
569
570
571 movq mm5,[esi+8]
572 pand mm5,mm1
573 movq mm7,mm1
574 movq mm6,[ebx+8]
575 pandn mm7,mm6
576 por mm5,mm7
577 movq [ebx+8],mm5
578
579 movq mm6,[esi+16]
580 pand mm6,mm2
581 movq mm4,mm2
582 movq mm7,[ebx+16]
583 pandn mm4,mm7
584 por mm6,mm4
585 movq [ebx+16],mm6
586
587 add esi,24 //inc by 24 bytes processed
588 add ebx,24
589 sub ecx,8 //dec by 8 pixels processed
590
591 ja mainloop24
592
593 mainloop24end:
594 mov ecx,diff
595 cmp ecx,0
596 jz end24
597
598 mov edx,mask
599 sal edx,24 //make low byte the high byte
600 secondloop24:
601 sal edx,1 //move high bit to CF
602 jnc skip24 //if CF = 0
603 mov ax,[esi]
604 mov [ebx],ax
605 xor eax,eax
606 mov al,[esi+2]
607 mov [ebx+2],al
608 skip24:
609 add esi,3
610 add ebx,3
611
612 dec ecx
613 jnz secondloop24
614
615 end24:
616 emms
617 }
618 }
619 else /* mmx not supported - use modified C routine */
620 {
621 register unsigned int incr1, initial_val, final_val;
622 png_size_t pixel_bytes;
623 png_uint_32 i;
624 register int disp = png_pass_inc[png_ptr->pass];
625 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
626
627 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
628 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
629 pixel_bytes;
630 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
631 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
632 final_val = png_ptr->width*pixel_bytes;
633 incr1 = (disp)*pixel_bytes;
634 for (i = initial_val; i < final_val; i += incr1)
635 {
636 png_memcpy(dstptr, srcptr, pixel_bytes);
637 srcptr += incr1;
638 dstptr += incr1;
639 }
640 } /* end of else */
641
642 break;
643 } // end 24 bpp
644
645 case 32:
646 {
647 png_bytep srcptr;
648 png_bytep dstptr;
649 png_uint_32 len;
650 int unmask, diff;
651
652 __int64 mask3=0x0101010102020202, //32bpp
653 mask2=0x0404040408080808,
654 mask1=0x1010101020202020,
655 mask0=0x4040404080808080;
656
657 srcptr = png_ptr->row_buf + 1;
658 dstptr = row;
659
660 unmask = ~mask;
661 len = (png_ptr->width)&~7;
662 diff = (png_ptr->width)&7;
663
664 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
665 /* && mmx_supported */ )
666 {
667 _asm
668 {
669 movd mm7, unmask //load bit pattern
670 psubb mm6,mm6 //zero mm6
671 punpcklbw mm7,mm7
672 punpcklwd mm7,mm7
673 punpckldq mm7,mm7 //fill register with 8 masks
674
675 movq mm0,mask0
676 movq mm1,mask1
677 movq mm2,mask2
678 movq mm3,mask3
679
680 pand mm0,mm7
681 pand mm1,mm7
682 pand mm2,mm7
683 pand mm3,mm7
684
685 pcmpeqb mm0,mm6
686 pcmpeqb mm1,mm6
687 pcmpeqb mm2,mm6
688 pcmpeqb mm3,mm6
689
690 mov ecx,len //load length of line
691 mov esi,srcptr //load source
692 mov ebx,dstptr //load dest
693
694 cmp ecx,0 //lcr
695 jz mainloop32end
696
697 mainloop32:
698 movq mm4,[esi]
699 pand mm4,mm0
700 movq mm6,mm0
701 movq mm7,[ebx]
702 pandn mm6,mm7
703 por mm4,mm6
704 movq [ebx],mm4
705
706 movq mm5,[esi+8]
707 pand mm5,mm1
708 movq mm7,mm1
709 movq mm6,[ebx+8]
710 pandn mm7,mm6
711 por mm5,mm7
712 movq [ebx+8],mm5
713
714 movq mm6,[esi+16]
715 pand mm6,mm2
716 movq mm4,mm2
717 movq mm7,[ebx+16]
718 pandn mm4,mm7
719 por mm6,mm4
720 movq [ebx+16],mm6
721
722 movq mm7,[esi+24]
723 pand mm7,mm3
724 movq mm5,mm3
725 movq mm4,[ebx+24]
726 pandn mm5,mm4
727 por mm7,mm5
728 movq [ebx+24],mm7
729
730 add esi,32 //inc by 32 bytes processed
731 add ebx,32
732 sub ecx,8 //dec by 8 pixels processed
733
734 ja mainloop32
735
736 mainloop32end:
737 mov ecx,diff
738 cmp ecx,0
739 jz end32
740
741 mov edx,mask
742 sal edx,24 //make low byte the high byte
743 secondloop32:
744 sal edx,1 //move high bit to CF
745 jnc skip32 //if CF = 0
746 mov eax,[esi]
747 mov [ebx],eax
748 skip32:
749 add esi,4
750 add ebx,4
751
752 dec ecx
753 jnz secondloop32
754
755 end32:
756 emms
757 }
758 }
759 else /* mmx _not supported - Use modified C routine */
760 {
761 register unsigned int incr1, initial_val, final_val;
762 png_size_t pixel_bytes;
763 png_uint_32 i;
764 register int disp = png_pass_inc[png_ptr->pass];
765 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
766
767 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
768 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
769 pixel_bytes;
770 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
771 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
772 final_val = png_ptr->width*pixel_bytes;
773 incr1 = (disp)*pixel_bytes;
774 for (i = initial_val; i < final_val; i += incr1)
775 {
776 png_memcpy(dstptr, srcptr, pixel_bytes);
777 srcptr += incr1;
778 dstptr += incr1;
779 }
780 } /* end of else */
781
782 break;
783 } // end 32 bpp
784
785 case 48:
786 {
787 png_bytep srcptr;
788 png_bytep dstptr;
789 png_uint_32 len;
790 int unmask, diff;
791
792 __int64 mask5=0x0101010101010202,
793 mask4=0x0202020204040404,
794 mask3=0x0404080808080808,
795 mask2=0x1010101010102020,
796 mask1=0x2020202040404040,
797 mask0=0x4040808080808080;
798
799 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
800 /* && mmx_supported */ )
801 {
802 srcptr = png_ptr->row_buf + 1;
803 dstptr = row;
804
805 unmask = ~mask;
806 len = (png_ptr->width)&~7;
807 diff = (png_ptr->width)&7;
808 _asm
809 {
810 movd mm7, unmask //load bit pattern
811 psubb mm6,mm6 //zero mm6
812 punpcklbw mm7,mm7
813 punpcklwd mm7,mm7
814 punpckldq mm7,mm7 //fill register with 8 masks
815
816 movq mm0,mask0
817 movq mm1,mask1
818 movq mm2,mask2
819 movq mm3,mask3
820 movq mm4,mask4
821 movq mm5,mask5
822
823 pand mm0,mm7
824 pand mm1,mm7
825 pand mm2,mm7
826 pand mm3,mm7
827 pand mm4,mm7
828 pand mm5,mm7
829
830 pcmpeqb mm0,mm6
831 pcmpeqb mm1,mm6
832 pcmpeqb mm2,mm6
833 pcmpeqb mm3,mm6
834 pcmpeqb mm4,mm6
835 pcmpeqb mm5,mm6
836
837 mov ecx,len //load length of line
838 mov esi,srcptr //load source
839 mov ebx,dstptr //load dest
840
841 cmp ecx,0
842 jz mainloop48end
843
844 mainloop48:
845 movq mm7,[esi]
846 pand mm7,mm0
847 movq mm6,mm0
848 pandn mm6,[ebx]
849 por mm7,mm6
850 movq [ebx],mm7
851
852 movq mm6,[esi+8]
853 pand mm6,mm1
854 movq mm7,mm1
855 pandn mm7,[ebx+8]
856 por mm6,mm7
857 movq [ebx+8],mm6
858
859 movq mm6,[esi+16]
860 pand mm6,mm2
861 movq mm7,mm2
862 pandn mm7,[ebx+16]
863 por mm6,mm7
864 movq [ebx+16],mm6
865
866 movq mm7,[esi+24]
867 pand mm7,mm3
868 movq mm6,mm3
869 pandn mm6,[ebx+24]
870 por mm7,mm6
871 movq [ebx+24],mm7
872
873 movq mm6,[esi+32]
874 pand mm6,mm4
875 movq mm7,mm4
876 pandn mm7,[ebx+32]
877 por mm6,mm7
878 movq [ebx+32],mm6
879
880 movq mm7,[esi+40]
881 pand mm7,mm5
882 movq mm6,mm5
883 pandn mm6,[ebx+40]
884 por mm7,mm6
885 movq [ebx+40],mm7
886
887 add esi,48 //inc by 32 bytes processed
888 add ebx,48
889 sub ecx,8 //dec by 8 pixels processed
890
891 ja mainloop48
892 mainloop48end:
893
894 mov ecx,diff
895 cmp ecx,0
896 jz end48
897
898 mov edx,mask
899 sal edx,24 //make low byte the high byte
900
901 secondloop48:
902 sal edx,1 //move high bit to CF
903 jnc skip48 //if CF = 0
904 mov eax,[esi]
905 mov [ebx],eax
906 skip48:
907 add esi,4
908 add ebx,4
909
910 dec ecx
911 jnz secondloop48
912
913 end48:
914 emms
915 }
916 }
917 else /* mmx _not supported - Use modified C routine */
918 {
919 register unsigned int incr1, initial_val, final_val;
920 png_size_t pixel_bytes;
921 png_uint_32 i;
922 register int disp = png_pass_inc[png_ptr->pass];
923 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
924
925 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
926 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
927 pixel_bytes;
928 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
929 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
930 final_val = png_ptr->width*pixel_bytes;
931 incr1 = (disp)*pixel_bytes;
932 for (i = initial_val; i < final_val; i += incr1)
933 {
934 png_memcpy(dstptr, srcptr, pixel_bytes);
935 srcptr += incr1;
936 dstptr += incr1;
937 }
938 } /* end of else */
939
940 break;
941 } // end 48 bpp
942
943 default:
944 {
945 png_bytep sptr;
946 png_bytep dp;
947 png_size_t pixel_bytes;
948 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
949 unsigned int i;
950 register int disp = png_pass_inc[png_ptr->pass]; // get the offset
951 register unsigned int incr1, initial_val, final_val;
952
953 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
954 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
955 pixel_bytes;
956 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
957 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
958 final_val = png_ptr->width*pixel_bytes;
959 incr1 = (disp)*pixel_bytes;
960 for (i = initial_val; i < final_val; i += incr1)
961 {
962 png_memcpy(dp, sptr, pixel_bytes);
963 sptr += incr1;
964 dp += incr1;
965 }
966 break;
967 }
968 } /* end switch (png_ptr->row_info.pixel_depth) */
969 } /* end if (non-trivial mask) */
970
971 } /* end png_combine_row() */
972
973
974 #if defined(PNG_READ_INTERLACING_SUPPORTED)
975
976 void /* PRIVATE */
977 png_do_read_interlace(png_structp png_ptr)
978 {
979 png_row_infop row_info = &(png_ptr->row_info);
980 png_bytep row = png_ptr->row_buf + 1;
981 int pass = png_ptr->pass;
982 png_uint_32 transformations = png_ptr->transformations;
983 #ifdef PNG_USE_LOCAL_ARRAYS
984 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
985 #endif
986
987 png_debug(1,"in png_do_read_interlace\n");
988
989 if (mmx_supported == 2) {
990 /* this should have happened in png_init_mmx_flags() already */
991 png_warning(png_ptr, "asm_flags may not have been initialized");
992 png_mmx_support();
993 }
994
995 if (row != NULL && row_info != NULL)
996 {
997 png_uint_32 final_width;
998
999 final_width = row_info->width * png_pass_inc[pass];
1000
1001 switch (row_info->pixel_depth)
1002 {
1003 case 1:
1004 {
1005 png_bytep sp, dp;
1006 int sshift, dshift;
1007 int s_start, s_end, s_inc;
1008 png_byte v;
1009 png_uint_32 i;
1010 int j;
1011
1012 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1013 dp = row + (png_size_t)((final_width - 1) >> 3);
1014 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1015 if (transformations & PNG_PACKSWAP)
1016 {
1017 sshift = (int)((row_info->width + 7) & 7);
1018 dshift = (int)((final_width + 7) & 7);
1019 s_start = 7;
1020 s_end = 0;
1021 s_inc = -1;
1022 }
1023 else
1024 #endif
1025 {
1026 sshift = 7 - (int)((row_info->width + 7) & 7);
1027 dshift = 7 - (int)((final_width + 7) & 7);
1028 s_start = 0;
1029 s_end = 7;
1030 s_inc = 1;
1031 }
1032
1033 for (i = row_info->width; i; i--)
1034 {
1035 v = (png_byte)((*sp >> sshift) & 0x1);
1036 for (j = 0; j < png_pass_inc[pass]; j++)
1037 {
1038 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1039 *dp |= (png_byte)(v << dshift);
1040 if (dshift == s_end)
1041 {
1042 dshift = s_start;
1043 dp--;
1044 }
1045 else
1046 dshift += s_inc;
1047 }
1048 if (sshift == s_end)
1049 {
1050 sshift = s_start;
1051 sp--;
1052 }
1053 else
1054 sshift += s_inc;
1055 }
1056 break;
1057 }
1058
1059 case 2:
1060 {
1061 png_bytep sp, dp;
1062 int sshift, dshift;
1063 int s_start, s_end, s_inc;
1064 png_uint_32 i;
1065
1066 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1067 dp = row + (png_size_t)((final_width - 1) >> 2);
1068 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1069 if (transformations & PNG_PACKSWAP)
1070 {
1071 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1072 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1073 s_start = 6;
1074 s_end = 0;
1075 s_inc = -2;
1076 }
1077 else
1078 #endif
1079 {
1080 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1081 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1082 s_start = 0;
1083 s_end = 6;
1084 s_inc = 2;
1085 }
1086
1087 for (i = row_info->width; i; i--)
1088 {
1089 png_byte v;
1090 int j;
1091
1092 v = (png_byte)((*sp >> sshift) & 0x3);
1093 for (j = 0; j < png_pass_inc[pass]; j++)
1094 {
1095 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1096 *dp |= (png_byte)(v << dshift);
1097 if (dshift == s_end)
1098 {
1099 dshift = s_start;
1100 dp--;
1101 }
1102 else
1103 dshift += s_inc;
1104 }
1105 if (sshift == s_end)
1106 {
1107 sshift = s_start;
1108 sp--;
1109 }
1110 else
1111 sshift += s_inc;
1112 }
1113 break;
1114 }
1115
1116 case 4:
1117 {
1118 png_bytep sp, dp;
1119 int sshift, dshift;
1120 int s_start, s_end, s_inc;
1121 png_uint_32 i;
1122
1123 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1124 dp = row + (png_size_t)((final_width - 1) >> 1);
1125 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1126 if (transformations & PNG_PACKSWAP)
1127 {
1128 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1129 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1130 s_start = 4;
1131 s_end = 0;
1132 s_inc = -4;
1133 }
1134 else
1135 #endif
1136 {
1137 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1138 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1139 s_start = 0;
1140 s_end = 4;
1141 s_inc = 4;
1142 }
1143
1144 for (i = row_info->width; i; i--)
1145 {
1146 png_byte v;
1147 int j;
1148
1149 v = (png_byte)((*sp >> sshift) & 0xf);
1150 for (j = 0; j < png_pass_inc[pass]; j++)
1151 {
1152 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1153 *dp |= (png_byte)(v << dshift);
1154 if (dshift == s_end)
1155 {
1156 dshift = s_start;
1157 dp--;
1158 }
1159 else
1160 dshift += s_inc;
1161 }
1162 if (sshift == s_end)
1163 {
1164 sshift = s_start;
1165 sp--;
1166 }
1167 else
1168 sshift += s_inc;
1169 }
1170 break;
1171 }
1172
1173 default: // This is the place where the routine is modified
1174 {
1175 __int64 const4 = 0x0000000000FFFFFF;
1176 // __int64 const5 = 0x000000FFFFFF0000; // unused...
1177 __int64 const6 = 0x00000000000000FF;
1178 png_bytep sptr, dp;
1179 png_uint_32 i;
1180 png_size_t pixel_bytes;
1181 int width = row_info->width;
1182
1183 pixel_bytes = (row_info->pixel_depth >> 3);
1184
1185 sptr = row + (width - 1) * pixel_bytes;
1186 dp = row + (final_width - 1) * pixel_bytes;
1187 // New code by Nirav Chhatrapati - Intel Corporation
1188 // sign fix by GRR
1189 // NOTE: there is NO MMX code for 48-bit and 64-bit images
1190
1191 // use MMX routine if machine supports it
1192 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1193 /* && mmx_supported */ )
1194 {
1195 if (pixel_bytes == 3)
1196 {
1197 if (((pass == 0) || (pass == 1)) && width)
1198 {
1199 _asm
1200 {
1201 mov esi, sptr
1202 mov edi, dp
1203 mov ecx, width
1204 sub edi, 21 // (png_pass_inc[pass] - 1)*pixel_bytes
1205 loop_pass0:
1206 movd mm0, [esi] ; X X X X X v2 v1 v0
1207 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1208 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1209 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1210 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1211 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1212 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1213 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1214 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1215 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1216 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1217 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1218 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1219 movq [edi+16] , mm4
1220 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1221 movq [edi+8] , mm3
1222 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1223 sub esi, 3
1224 movq [edi], mm0
1225 sub edi, 24
1226 //sub esi, 3
1227 dec ecx
1228 jnz loop_pass0
1229 EMMS
1230 }
1231 }
1232 else if (((pass == 2) || (pass == 3)) && width)
1233 {
1234 _asm
1235 {
1236 mov esi, sptr
1237 mov edi, dp
1238 mov ecx, width
1239 sub edi, 9 // (png_pass_inc[pass] - 1)*pixel_bytes
1240 loop_pass2:
1241 movd mm0, [esi] ; X X X X X v2 v1 v0
1242 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1243 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1244 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1245 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1246 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1247 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1248 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1249 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1250 movq [edi+4], mm0 ; move to memory
1251 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1252 movd [edi], mm0 ; move to memory
1253 sub esi, 3
1254 sub edi, 12
1255 dec ecx
1256 jnz loop_pass2
1257 EMMS
1258 }
1259 }
1260 else if (width) /* && ((pass == 4) || (pass == 5)) */
1261 {
1262 int width_mmx = ((width >> 1) << 1) - 8;
1263 if (width_mmx < 0)
1264 width_mmx = 0;
1265 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1266 if (width_mmx)
1267 {
1268 _asm
1269 {
1270 mov esi, sptr
1271 mov edi, dp
1272 mov ecx, width_mmx
1273 sub esi, 3
1274 sub edi, 9
1275 loop_pass4:
1276 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1277 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1278 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1279 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1280 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1281 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1282 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1283 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1284 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1285 movq [edi], mm0 ; move quad to memory
1286 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1287 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1288 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1289 movd [edi+8], mm6 ; move double to memory
1290 sub esi, 6
1291 sub edi, 12
1292 sub ecx, 2
1293 jnz loop_pass4
1294 EMMS
1295 }
1296 }
1297
1298 sptr -= width_mmx*3;
1299 dp -= width_mmx*6;
1300 for (i = width; i; i--)
1301 {
1302 png_byte v[8];
1303 int j;
1304
1305 png_memcpy(v, sptr, 3);
1306 for (j = 0; j < png_pass_inc[pass]; j++)
1307 {
1308 png_memcpy(dp, v, 3);
1309 dp -= 3;
1310 }
1311 sptr -= 3;
1312 }
1313 }
1314 } /* end of pixel_bytes == 3 */
1315
1316 else if (pixel_bytes == 1)
1317 {
1318 if (((pass == 0) || (pass == 1)) && width)
1319 {
1320 int width_mmx = ((width >> 2) << 2);
1321 width -= width_mmx;
1322 if (width_mmx)
1323 {
1324 _asm
1325 {
1326 mov esi, sptr
1327 mov edi, dp
1328 mov ecx, width_mmx
1329 sub edi, 31
1330 sub esi, 3
1331 loop1_pass0:
1332 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1333 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1334 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1335 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1336 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1337 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1338 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1339 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1340 movq [edi], mm0 ; move to memory v3
1341 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1342 movq [edi+8], mm3 ; move to memory v2
1343 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1344 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1345 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1346 movq [edi+16], mm2 ; move to memory v1
1347 movq [edi+24], mm4 ; move to memory v0
1348 sub esi, 4
1349 sub edi, 32
1350 sub ecx, 4
1351 jnz loop1_pass0
1352 EMMS
1353 }
1354 }
1355
1356 sptr -= width_mmx;
1357 dp -= width_mmx*8;
1358 for (i = width; i; i--)
1359 {
1360 int j;
1361
1362 /* I simplified this part in version 1.0.4e
1363 * here and in several other instances where
1364 * pixel_bytes == 1 -- GR-P
1365 *
1366 * Original code:
1367 *
1368 * png_byte v[8];
1369 * png_memcpy(v, sptr, pixel_bytes);
1370 * for (j = 0; j < png_pass_inc[pass]; j++)
1371 * {
1372 * png_memcpy(dp, v, pixel_bytes);
1373 * dp -= pixel_bytes;
1374 * }
1375 * sptr -= pixel_bytes;
1376 *
1377 * Replacement code is in the next three lines:
1378 */
1379
1380 for (j = 0; j < png_pass_inc[pass]; j++)
1381 *dp-- = *sptr;
1382 sptr--;
1383 }
1384 }
1385 else if (((pass == 2) || (pass == 3)) && width)
1386 {
1387 int width_mmx = ((width >> 2) << 2);
1388 width -= width_mmx;
1389 if (width_mmx)
1390 {
1391 _asm
1392 {
1393 mov esi, sptr
1394 mov edi, dp
1395 mov ecx, width_mmx
1396 sub edi, 15
1397 sub esi, 3
1398 loop1_pass2:
1399 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1400 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1401 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1402 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1403 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1404 movq [edi], mm0 ; move to memory v2 and v3
1405 sub esi, 4
1406 movq [edi+8], mm1 ; move to memory v1 and v0
1407 sub edi, 16
1408 sub ecx, 4
1409 jnz loop1_pass2
1410 EMMS
1411 }
1412 }
1413
1414 sptr -= width_mmx;
1415 dp -= width_mmx*4;
1416 for (i = width; i; i--)
1417 {
1418 int j;
1419
1420 for (j = 0; j < png_pass_inc[pass]; j++)
1421 {
1422 *dp-- = *sptr;
1423 }
1424 sptr --;
1425 }
1426 }
1427 else if (width) /* && ((pass == 4) || (pass == 5))) */
1428 {
1429 int width_mmx = ((width >> 3) << 3);
1430 width -= width_mmx;
1431 if (width_mmx)
1432 {
1433 _asm
1434 {
1435 mov esi, sptr
1436 mov edi, dp
1437 mov ecx, width_mmx
1438 sub edi, 15
1439 sub esi, 7
1440 loop1_pass4:
1441 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1442 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1443 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1444 //movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1445 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1446 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1447 sub esi, 8
1448 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1449 //sub esi, 4
1450 sub edi, 16
1451 sub ecx, 8
1452 jnz loop1_pass4
1453 EMMS
1454 }
1455 }
1456
1457 sptr -= width_mmx;
1458 dp -= width_mmx*2;
1459 for (i = width; i; i--)
1460 {
1461 int j;
1462
1463 for (j = 0; j < png_pass_inc[pass]; j++)
1464 {
1465 *dp-- = *sptr;
1466 }
1467 sptr --;
1468 }
1469 }
1470 } /* end of pixel_bytes == 1 */
1471
1472 else if (pixel_bytes == 2)
1473 {
1474 if (((pass == 0) || (pass == 1)) && width)
1475 {
1476 int width_mmx = ((width >> 1) << 1);
1477 width -= width_mmx;
1478 if (width_mmx)
1479 {
1480 _asm
1481 {
1482 mov esi, sptr
1483 mov edi, dp
1484 mov ecx, width_mmx
1485 sub esi, 2
1486 sub edi, 30
1487 loop2_pass0:
1488 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1489 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1490 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1491 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1492 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1493 movq [edi], mm0
1494 movq [edi + 8], mm0
1495 movq [edi + 16], mm1
1496 movq [edi + 24], mm1
1497 sub esi, 4
1498 sub edi, 32
1499 sub ecx, 2
1500 jnz loop2_pass0
1501 EMMS
1502 }
1503 }
1504
1505 sptr -= (width_mmx*2 - 2); // sign fixed
1506 dp -= (width_mmx*16 - 2); // sign fixed
1507 for (i = width; i; i--)
1508 {
1509 png_byte v[8];
1510 int j;
1511 sptr -= 2;
1512 png_memcpy(v, sptr, 2);
1513 for (j = 0; j < png_pass_inc[pass]; j++)
1514 {
1515 dp -= 2;
1516 png_memcpy(dp, v, 2);
1517 }
1518 }
1519 }
1520 else if (((pass == 2) || (pass == 3)) && width)
1521 {
1522 int width_mmx = ((width >> 1) << 1) ;
1523 width -= width_mmx;
1524 if (width_mmx)
1525 {
1526 _asm
1527 {
1528 mov esi, sptr
1529 mov edi, dp
1530 mov ecx, width_mmx
1531 sub esi, 2
1532 sub edi, 14
1533 loop2_pass2:
1534 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1535 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1536 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1537 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1538 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1539 movq [edi], mm0
1540 sub esi, 4
1541 movq [edi + 8], mm1
1542 //sub esi, 4
1543 sub edi, 16
1544 sub ecx, 2
1545 jnz loop2_pass2
1546 EMMS
1547 }
1548 }
1549
1550 sptr -= (width_mmx*2 - 2); // sign fixed
1551 dp -= (width_mmx*8 - 2); // sign fixed
1552 for (i = width; i; i--)
1553 {
1554 png_byte v[8];
1555 int j;
1556 sptr -= 2;
1557 png_memcpy(v, sptr, 2);
1558 for (j = 0; j < png_pass_inc[pass]; j++)
1559 {
1560 dp -= 2;
1561 png_memcpy(dp, v, 2);
1562 }
1563 }
1564 }
1565 else if (width) // pass == 4 or 5
1566 {
1567 int width_mmx = ((width >> 1) << 1) ;
1568 width -= width_mmx;
1569 if (width_mmx)
1570 {
1571 _asm
1572 {
1573 mov esi, sptr
1574 mov edi, dp
1575 mov ecx, width_mmx
1576 sub esi, 2
1577 sub edi, 6
1578 loop2_pass4:
1579 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1580 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1581 sub esi, 4
1582 movq [edi], mm0
1583 sub edi, 8
1584 sub ecx, 2
1585 jnz loop2_pass4
1586 EMMS
1587 }
1588 }
1589
1590 sptr -= (width_mmx*2 - 2); // sign fixed
1591 dp -= (width_mmx*4 - 2); // sign fixed
1592 for (i = width; i; i--)
1593 {
1594 png_byte v[8];
1595 int j;
1596 sptr -= 2;
1597 png_memcpy(v, sptr, 2);
1598 for (j = 0; j < png_pass_inc[pass]; j++)
1599 {
1600 dp -= 2;
1601 png_memcpy(dp, v, 2);
1602 }
1603 }
1604 }
1605 } /* end of pixel_bytes == 2 */
1606
1607 else if (pixel_bytes == 4)
1608 {
1609 if (((pass == 0) || (pass == 1)) && width)
1610 {
1611 int width_mmx = ((width >> 1) << 1) ;
1612 width -= width_mmx;
1613 if (width_mmx)
1614 {
1615 _asm
1616 {
1617 mov esi, sptr
1618 mov edi, dp
1619 mov ecx, width_mmx
1620 sub esi, 4
1621 sub edi, 60
1622 loop4_pass0:
1623 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1624 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1625 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1626 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1627 movq [edi], mm0
1628 movq [edi + 8], mm0
1629 movq [edi + 16], mm0
1630 movq [edi + 24], mm0
1631 movq [edi+32], mm1
1632 movq [edi + 40], mm1
1633 movq [edi+ 48], mm1
1634 sub esi, 8
1635 movq [edi + 56], mm1
1636 sub edi, 64
1637 sub ecx, 2
1638 jnz loop4_pass0
1639 EMMS
1640 }
1641 }
1642
1643 sptr -= (width_mmx*4 - 4); // sign fixed
1644 dp -= (width_mmx*32 - 4); // sign fixed
1645 for (i = width; i; i--)
1646 {
1647 png_byte v[8];
1648 int j;
1649 sptr -= 4;
1650 png_memcpy(v, sptr, 4);
1651 for (j = 0; j < png_pass_inc[pass]; j++)
1652 {
1653 dp -= 4;
1654 png_memcpy(dp, v, 4);
1655 }
1656 }
1657 }
1658 else if (((pass == 2) || (pass == 3)) && width)
1659 {
1660 int width_mmx = ((width >> 1) << 1) ;
1661 width -= width_mmx;
1662 if (width_mmx)
1663 {
1664 _asm
1665 {
1666 mov esi, sptr
1667 mov edi, dp
1668 mov ecx, width_mmx
1669 sub esi, 4
1670 sub edi, 28
1671 loop4_pass2:
1672 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1673 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1674 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1675 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1676 movq [edi], mm0
1677 movq [edi + 8], mm0
1678 movq [edi+16], mm1
1679 movq [edi + 24], mm1
1680 sub esi, 8
1681 sub edi, 32
1682 sub ecx, 2
1683 jnz loop4_pass2
1684 EMMS
1685 }
1686 }
1687
1688 sptr -= (width_mmx*4 - 4); // sign fixed
1689 dp -= (width_mmx*16 - 4); // sign fixed
1690 for (i = width; i; i--)
1691 {
1692 png_byte v[8];
1693 int j;
1694 sptr -= 4;
1695 png_memcpy(v, sptr, 4);
1696 for (j = 0; j < png_pass_inc[pass]; j++)
1697 {
1698 dp -= 4;
1699 png_memcpy(dp, v, 4);
1700 }
1701 }
1702 }
1703 else if (width) // pass == 4 or 5
1704 {
1705 int width_mmx = ((width >> 1) << 1) ;
1706 width -= width_mmx;
1707 if (width_mmx)
1708 {
1709 _asm
1710 {
1711 mov esi, sptr
1712 mov edi, dp
1713 mov ecx, width_mmx
1714 sub esi, 4
1715 sub edi, 12
1716 loop4_pass4:
1717 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1718 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1719 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1720 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1721 movq [edi], mm0
1722 sub esi, 8
1723 movq [edi + 8], mm1
1724 sub edi, 16
1725 sub ecx, 2
1726 jnz loop4_pass4
1727 EMMS
1728 }
1729 }
1730
1731 sptr -= (width_mmx*4 - 4); // sign fixed
1732 dp -= (width_mmx*8 - 4); // sign fixed
1733 for (i = width; i; i--)
1734 {
1735 png_byte v[8];
1736 int j;
1737 sptr -= 4;
1738 png_memcpy(v, sptr, 4);
1739 for (j = 0; j < png_pass_inc[pass]; j++)
1740 {
1741 dp -= 4;
1742 png_memcpy(dp, v, 4);
1743 }
1744 }
1745 }
1746
1747 } /* end of pixel_bytes == 4 */
1748
1749 else if (pixel_bytes == 6)
1750 {
1751 for (i = width; i; i--)
1752 {
1753 png_byte v[8];
1754 int j;
1755 png_memcpy(v, sptr, 6);
1756 for (j = 0; j < png_pass_inc[pass]; j++)
1757 {
1758 png_memcpy(dp, v, 6);
1759 dp -= 6;
1760 }
1761 sptr -= 6;
1762 }
1763 } /* end of pixel_bytes == 6 */
1764
1765 else
1766 {
1767 for (i = width; i; i--)
1768 {
1769 png_byte v[8];
1770 int j;
1771 png_memcpy(v, sptr, pixel_bytes);
1772 for (j = 0; j < png_pass_inc[pass]; j++)
1773 {
1774 png_memcpy(dp, v, pixel_bytes);
1775 dp -= pixel_bytes;
1776 }
1777 sptr-= pixel_bytes;
1778 }
1779 }
1780 } /* end of mmx_supported */
1781
1782 else /* MMX not supported: use modified C code - takes advantage
1783 * of inlining of memcpy for a constant */
1784 {
1785 if (pixel_bytes == 1)
1786 {
1787 for (i = width; i; i--)
1788 {
1789 int j;
1790 for (j = 0; j < png_pass_inc[pass]; j++)
1791 *dp-- = *sptr;
1792 sptr--;
1793 }
1794 }
1795 else if (pixel_bytes == 3)
1796 {
1797 for (i = width; i; i--)
1798 {
1799 png_byte v[8];
1800 int j;
1801 png_memcpy(v, sptr, pixel_bytes);
1802 for (j = 0; j < png_pass_inc[pass]; j++)
1803 {
1804 png_memcpy(dp, v, pixel_bytes);
1805 dp -= pixel_bytes;
1806 }
1807 sptr -= pixel_bytes;
1808 }
1809 }
1810 else if (pixel_bytes == 2)
1811 {
1812 for (i = width; i; i--)
1813 {
1814 png_byte v[8];
1815 int j;
1816 png_memcpy(v, sptr, pixel_bytes);
1817 for (j = 0; j < png_pass_inc[pass]; j++)
1818 {
1819 png_memcpy(dp, v, pixel_bytes);
1820 dp -= pixel_bytes;
1821 }
1822 sptr -= pixel_bytes;
1823 }
1824 }
1825 else if (pixel_bytes == 4)
1826 {
1827 for (i = width; i; i--)
1828 {
1829 png_byte v[8];
1830 int j;
1831 png_memcpy(v, sptr, pixel_bytes);
1832 for (j = 0; j < png_pass_inc[pass]; j++)
1833 {
1834 png_memcpy(dp, v, pixel_bytes);
1835 dp -= pixel_bytes;
1836 }
1837 sptr -= pixel_bytes;
1838 }
1839 }
1840 else if (pixel_bytes == 6)
1841 {
1842 for (i = width; i; i--)
1843 {
1844 png_byte v[8];
1845 int j;
1846 png_memcpy(v, sptr, pixel_bytes);
1847 for (j = 0; j < png_pass_inc[pass]; j++)
1848 {
1849 png_memcpy(dp, v, pixel_bytes);
1850 dp -= pixel_bytes;
1851 }
1852 sptr -= pixel_bytes;
1853 }
1854 }
1855 else
1856 {
1857 for (i = width; i; i--)
1858 {
1859 png_byte v[8];
1860 int j;
1861 png_memcpy(v, sptr, pixel_bytes);
1862 for (j = 0; j < png_pass_inc[pass]; j++)
1863 {
1864 png_memcpy(dp, v, pixel_bytes);
1865 dp -= pixel_bytes;
1866 }
1867 sptr -= pixel_bytes;
1868 }
1869 }
1870
1871 } /* end of MMX not supported */
1872 break;
1873 }
1874 } /* end switch (row_info->pixel_depth) */
1875
1876 row_info->width = final_width;
1877 row_info->rowbytes = ((final_width *
1878 (png_uint_32)row_info->pixel_depth + 7) >> 3);
1879 }
1880
1881 }
1882
1883 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1884
1885
1886 // These variables are utilized in the functions below. They are declared
1887 // globally here to ensure alignment on 8-byte boundaries.
//
// Each variable is one 64-bit quantity that the inline assembly below uses
// directly as a memory operand (e.g. "movq mm5, LBCarryMask" or
// "psrlq mm2, ShiftRem").
//
//   LBCarryMask   - constant; 0x01 in every byte (selects the low bit of
//                   each byte)
//   HBClearMask   - constant; 0x7f in every byte (clears bit 7 of each byte
//                   after a 64-bit shift right by one)
//   ActiveMask, ActiveMaskEnd, ShiftBpp, ShiftRem
//                 - scratch values assigned through .use by the filter
//                   functions below before their MMX loops run
//   ActiveMask2   - NOTE(review): not referenced in the part of the file
//                   reviewed here; confirm where it is used
//
// NOTE(review): the scratch variables are shared, writable globals, so two
// threads running the filter functions at the same time would share them --
// confirm that callers never do that.
1888
1889 union uAll {
1890 __int64 use; // the 64-bit value that is actually read and written
1891 double align; // never accessed in the code below; presumably present only to request 8-byte alignment (see the comment above)
1892 } LBCarryMask = {0x0101010101010101},
1893 HBClearMask = {0x7f7f7f7f7f7f7f7f},
1894 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1895
1896
1897 // Optimized code for PNG Average filter decoder
//
// Reverses the PNG "Average" filter on one row, in place, using
//    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2)
// where the final addition is done on bytes (wraps modulo 256) and, for the
// first bpp bytes of the row, only Prior(x)/2 is added.
//
//   row_info - supplies pixel_depth (bpp is derived from it) and rowbytes
//   row      - on entry holds the filtered bytes Avg(x); on return holds Raw(x)
//   prev_row - the bytes Prior(x); only read
//
// The work is done in four stages:
//   1. scalar loop over offsets [0, bpp)
//   2. scalar loop over offsets [bpp, diff), where diff is chosen so that
//      row + diff lies on an 8-byte boundary
//   3. MMX loop selected by bpp, 8 bytes per iteration, over [diff, MMXLength)
//   4. scalar clean-up loop over [MMXLength, FullLength), followed by emms
// The bpp == 1 case is the exception: it runs a scalar loop over
// [diff, FullLength) and returns directly.
//
// NOTE(review): stage 2 is bounded by diff, not FullLength, and every MMX
// loop tests its exit condition only at the bottom, so the code appears to
// rely on rowbytes being at least diff + 8 -- confirm that the caller
// guarantees this.
// NOTE(review): ActiveMask, ShiftBpp and ShiftRem are globals that are
// written on each call; confirm that calls are never made concurrently.
1898 void /* PRIVATE */
1899 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1900 , png_bytep prev_row)
1901 {
1902 int bpp;
1903 png_uint_32 FullLength;
1904 png_uint_32 MMXLength;
1905 //png_uint_32 len;
1906 int diff;
1907
1908 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
1909 FullLength = row_info->rowbytes; // # of bytes to filter
1910 _asm {
1911 // Init address pointers and offset
1912 mov edi, row // edi ==> Avg(x)
1913 xor ebx, ebx // ebx ==> x
1914 mov edx, edi
1915 mov esi, prev_row // esi ==> Prior(x)
1916 sub edx, bpp // edx ==> Raw(x-bpp)
1917
1918 xor eax, eax
1919 // Compute the Raw value for the first bpp bytes
1920 // Raw(x) = Avg(x) + (Prior(x)/2)
1921 davgrlp:
1922 mov al, [esi + ebx] // Load al with Prior(x)
1923 inc ebx
1924 shr al, 1 // divide by 2
1925 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1926 cmp ebx, bpp
1927 mov [edi+ebx-1], al // Write back Raw(x);
1928 // mov does not affect flags; -1 to offset inc ebx
1929 jb davgrlp
1930 // get # of bytes to alignment
// diff = ((row + bpp + 15) & ~7) - row, so row + diff is 8-byte aligned
// and diff lies in the range bpp+8 .. bpp+15 (ebx == bpp at this point)
1931 mov diff, edi // take start of row
1932 add diff, ebx // add bpp
1933 add diff, 0xf // add 7 + 8 to incr past alignment boundary
1934 and diff, 0xfffffff8 // mask to alignment boundary
1935 sub diff, edi // subtract from start ==> value ebx at alignment
1936 jz davggo // tests the result of the sub above; diff >= bpp+8, so not expected to be taken
1937 // fix alignment
1938 // Compute the Raw value for the bytes up to the alignment boundary
1939 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
1940 xor ecx, ecx
1941 davglp1:
1942 xor eax, eax
1943 mov cl, [esi + ebx] // load cl with Prior(x)
1944 mov al, [edx + ebx] // load al with Raw(x-bpp)
1945 add ax, cx // 16-bit add keeps the carry out of bit 7
1946 inc ebx
1947 shr ax, 1 // divide by 2
1948 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
1949 cmp ebx, diff // Check if at alignment boundary
1950 mov [edi+ebx-1], al // Write back Raw(x);
1951 // mov does not affect flags; -1 to offset inc ebx
1952 jb davglp1 // Repeat until at alignment boundary
1953 davggo:
// MMXLength = FullLength - ((FullLength - ebx) & 7), i.e. the offset at
// which the 8-byte-per-iteration loops below must stop
1954 mov eax, FullLength
1955 mov ecx, eax
1956 sub eax, ebx // subtract alignment fix
1957 and eax, 0x00000007 // calc bytes over mult of 8
1958 sub ecx, eax // drop over bytes from original length
1959 mov MMXLength, ecx
1960 } // end _asm block
1961 // Now do the math for the rest of the row
// Register roles shared by the MMX loops below:
//   mm0 = Avg(x) bytes being turned into Raw(x), mm1 = Prior(x) / scratch,
//   mm2 = Raw(x-bpp) bytes, mm3 = low bits of Prior(x),
//   mm4 = HBClearMask, mm5 = LBCarryMask
// The average is formed per byte as
//   (Prior/2) + (Raw/2) + (lsb(Prior) & lsb(Raw))
1962 switch ( bpp )
1963 {
// bpp == 3: each 8-byte step is handled as three active groups
// (bytes 0-2, bytes 3-5, bytes 6-7); each later group takes its
// Raw(x-bpp) from the bytes just computed for the group before it.
1964 case 3:
1965 {
1966 ActiveMask.use = 0x0000000000ffffff;
1967 ShiftBpp.use = 24; // == 3 * 8
1968 ShiftRem.use = 40; // == 64 - 24
1969 _asm {
1970 // Re-init address pointers and offset
1971 movq mm7, ActiveMask
1972 mov ebx, diff // ebx ==> x = offset to alignment boundary
1973 movq mm5, LBCarryMask
1974 mov edi, row // edi ==> Avg(x)
1975 movq mm4, HBClearMask
1976 mov esi, prev_row // esi ==> Prior(x)
1977 // PRIME the pump (load the first Raw(x-bpp) data set
1978 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
1979 // (we correct position in loop below)
1980 davg3lp:
1981 movq mm0, [edi + ebx] // Load mm0 with Avg(x)
1982 // Add (Prev_row/2) to Average
1983 movq mm3, mm5
1984 psrlq mm2, ShiftRem // Correct position Raw(x-bpp) data
1985 movq mm1, [esi + ebx] // Load mm1 with Prior(x)
1986 movq mm6, mm7
1987 pand mm3, mm1 // get lsb for each prev_row byte
1988 psrlq mm1, 1 // divide prev_row bytes by 2
1989 pand mm1, mm4 // clear invalid bit 7 of each byte
1990 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
1991 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
1992 movq mm1, mm3 // now use mm1 for getting LBCarrys
1993 pand mm1, mm2 // get LBCarrys for each byte where both
1994 // lsb's were == 1 (Only valid for active group)
1995 psrlq mm2, 1 // divide raw bytes by 2
1996 pand mm2, mm4 // clear invalid bit 7 of each byte
1997 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
1998 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
1999 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2000 // byte
2001 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2002 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 3-5
2003 movq mm2, mm0 // mov updated Raws to mm2
2004 psllq mm2, ShiftBpp // shift data to position correctly
2005 movq mm1, mm3 // now use mm1 for getting LBCarrys
2006 pand mm1, mm2 // get LBCarrys for each byte where both
2007 // lsb's were == 1 (Only valid for active group)
2008 psrlq mm2, 1 // divide raw bytes by 2
2009 pand mm2, mm4 // clear invalid bit 7 of each byte
2010 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2011 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2012 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2013 // byte
2014
2015 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2016 psllq mm6, ShiftBpp // shift the mm6 mask to cover the last two
2017 // bytes
2018 movq mm2, mm0 // mov updated Raws to mm2
2019 psllq mm2, ShiftBpp // shift data to position correctly
2020 // Data only needs to be shifted once here to
2021 // get the correct x-bpp offset.
2022 movq mm1, mm3 // now use mm1 for getting LBCarrys
2023 pand mm1, mm2 // get LBCarrys for each byte where both
2024 // lsb's were == 1 (Only valid for active group)
2025 psrlq mm2, 1 // divide raw bytes by 2
2026 pand mm2, mm4 // clear invalid bit 7 of each byte
2027 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2028 pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2029 add ebx, 8
2030 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2031 // byte
2032
2033 // Now ready to write back to memory
2034 movq [edi + ebx - 8], mm0
2035 // Move updated Raw(x) to use as Raw(x-bpp) for next loop
2036 cmp ebx, MMXLength
2037 movq mm2, mm0 // mov updated Raw(x) to mm2
2038 jb davg3lp // flags still come from the cmp above; movq leaves them alone
2039 } // end _asm block
2040 }
2041 break;
2042
// bpp == 4..7: each 8-byte step is handled as two active groups; the
// masks are derived from bpp at run time (mm7 = low bpp bytes,
// mm6 = the remaining high bytes).
2043 case 6:
2044 case 4:
2045 case 7:
2046 case 5:
2047 {
2048 ActiveMask.use = 0xffffffffffffffff; // use shift below to clear
2049 // appropriate inactive bytes
2050 ShiftBpp.use = bpp << 3;
2051 ShiftRem.use = 64 - ShiftBpp.use;
2052 _asm {
2053 movq mm4, HBClearMask
2054 // Re-init address pointers and offset
2055 mov ebx, diff // ebx ==> x = offset to alignment boundary
2056 // Load ActiveMask and clear all bytes except for 1st active group
2057 movq mm7, ActiveMask
2058 mov edi, row // edi ==> Avg(x)
2059 psrlq mm7, ShiftRem
2060 mov esi, prev_row // esi ==> Prior(x)
2061 movq mm6, mm7
2062 movq mm5, LBCarryMask
2063 psllq mm6, ShiftBpp // Create mask for 2nd active group
2064 // PRIME the pump (load the first Raw(x-bpp) data set
2065 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2066 // (we correct position in loop below)
2067 davg4lp:
2068 movq mm0, [edi + ebx]
2069 psrlq mm2, ShiftRem // shift data to position correctly
2070 movq mm1, [esi + ebx]
2071 // Add (Prev_row/2) to Average
2072 movq mm3, mm5
2073 pand mm3, mm1 // get lsb for each prev_row byte
2074 psrlq mm1, 1 // divide prev_row bytes by 2
2075 pand mm1, mm4 // clear invalid bit 7 of each byte
2076 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2077 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2078 movq mm1, mm3 // now use mm1 for getting LBCarrys
2079 pand mm1, mm2 // get LBCarrys for each byte where both
2080 // lsb's were == 1 (Only valid for active group)
2081 psrlq mm2, 1 // divide raw bytes by 2
2082 pand mm2, mm4 // clear invalid bit 7 of each byte
2083 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2084 pand mm2, mm7 // Leave only Active Group 1 bytes to add to Avg
2085 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2086 // byte
2087 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2088 movq mm2, mm0 // mov updated Raws to mm2
2089 psllq mm2, ShiftBpp // shift data to position correctly
2090 add ebx, 8
2091 movq mm1, mm3 // now use mm1 for getting LBCarrys
2092 pand mm1, mm2 // get LBCarrys for each byte where both
2093 // lsb's were == 1 (Only valid for active group)
2094 psrlq mm2, 1 // divide raw bytes by 2
2095 pand mm2, mm4 // clear invalid bit 7 of each byte
2096 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2097 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2098 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active
2099 // byte
2100 cmp ebx, MMXLength
2101 // Now ready to write back to memory
2102 movq [edi + ebx - 8], mm0
2103 // Prep Raw(x-bpp) for next loop
2104 movq mm2, mm0 // mov updated Raws to mm2
2105 jb davg4lp // flags still come from the cmp above; movq leaves them alone
2106 } // end _asm block
2107 }
2108 break;
// bpp == 2: each 8-byte step is handled as four active groups of two
// bytes, each taking Raw(x-bpp) from the group computed just before it.
2109 case 2:
2110 {
2111 ActiveMask.use = 0x000000000000ffff;
2112 ShiftBpp.use = 16; // == 2 * 8 [BUGFIX]
2113 ShiftRem.use = 48; // == 64 - 16 [BUGFIX]
2114 _asm {
2115 // Load ActiveMask
2116 movq mm7, ActiveMask
2117 // Re-init address pointers and offset
2118 mov ebx, diff // ebx ==> x = offset to alignment boundary
2119 movq mm5, LBCarryMask
2120 mov edi, row // edi ==> Avg(x)
2121 movq mm4, HBClearMask
2122 mov esi, prev_row // esi ==> Prior(x)
2123 // PRIME the pump (load the first Raw(x-bpp) data set
2124 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2125 // (we correct position in loop below)
2126 davg2lp:
2127 movq mm0, [edi + ebx]
2128 psrlq mm2, ShiftRem // shift data to position correctly [BUGFIX]
2129 movq mm1, [esi + ebx]
2130 // Add (Prev_row/2) to Average
2131 movq mm3, mm5
2132 pand mm3, mm1 // get lsb for each prev_row byte
2133 psrlq mm1, 1 // divide prev_row bytes by 2
2134 pand mm1, mm4 // clear invalid bit 7 of each byte
2135 movq mm6, mm7
2136 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2137 // Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry
2138 movq mm1, mm3 // now use mm1 for getting LBCarrys
2139 pand mm1, mm2 // get LBCarrys for each byte where both
2140 // lsb's were == 1 (Only valid for active group)
2141 psrlq mm2, 1 // divide raw bytes by 2
2142 pand mm2, mm4 // clear invalid bit 7 of each byte
2143 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2144 pand mm2, mm6 // Leave only Active Group 1 bytes to add to Avg
2145 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2146 // Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry
2147 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 2 & 3
2148 movq mm2, mm0 // mov updated Raws to mm2
2149 psllq mm2, ShiftBpp // shift data to position correctly
2150 movq mm1, mm3 // now use mm1 for getting LBCarrys
2151 pand mm1, mm2 // get LBCarrys for each byte where both
2152 // lsb's were == 1 (Only valid for active group)
2153 psrlq mm2, 1 // divide raw bytes by 2
2154 pand mm2, mm4 // clear invalid bit 7 of each byte
2155 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2156 pand mm2, mm6 // Leave only Active Group 2 bytes to add to Avg
2157 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2158
2159 // Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry
2160 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 4 & 5
2161 movq mm2, mm0 // mov updated Raws to mm2
2162 psllq mm2, ShiftBpp // shift data to position correctly
2163 // Data only needs to be shifted once here to
2164 // get the correct x-bpp offset.
2165 movq mm1, mm3 // now use mm1 for getting LBCarrys
2166 pand mm1, mm2 // get LBCarrys for each byte where both
2167 // lsb's were == 1 (Only valid for active group)
2168 psrlq mm2, 1 // divide raw bytes by 2
2169 pand mm2, mm4 // clear invalid bit 7 of each byte
2170 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2171 pand mm2, mm6 // Leave only Active Group 3 bytes to add to Avg
2172 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2173
2174 // Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry
2175 psllq mm6, ShiftBpp // shift the mm6 mask to cover bytes 6 & 7
2176 movq mm2, mm0 // mov updated Raws to mm2
2177 psllq mm2, ShiftBpp // shift data to position correctly
2178 // Data only needs to be shifted once here to
2179 // get the correct x-bpp offset.
2180 add ebx, 8
2181 movq mm1, mm3 // now use mm1 for getting LBCarrys
2182 pand mm1, mm2 // get LBCarrys for each byte where both
2183 // lsb's were == 1 (Only valid for active group)
2184 psrlq mm2, 1 // divide raw bytes by 2
2185 pand mm2, mm4 // clear invalid bit 7 of each byte
2186 paddb mm2, mm1 // add LBCarrys to (Raw(x-bpp)/2) for each byte
2187 pand mm2, mm6 // Leave only Active Group 4 bytes to add to Avg
2188 paddb mm0, mm2 // add (Raw/2) + LBCarrys to Avg for each Active byte
2189
2190 cmp ebx, MMXLength
2191 // Now ready to write back to memory
2192 movq [edi + ebx - 8], mm0
2193 // Prep Raw(x-bpp) for next loop
2194 movq mm2, mm0 // mov updated Raws to mm2
2195 jb davg2lp // flags still come from the cmp above; movq leaves them alone
2196 } // end _asm block
2197 }
2198 break;
2199
// bpp == 1: no MMX loop; the rest of the row, [diff, FullLength), is
// decoded one byte at a time and the function returns from this case.
2200 case 1: // bpp == 1
2201 {
2202 _asm {
2203 // Re-init address pointers and offset
2204 mov ebx, diff // ebx ==> x = offset to alignment boundary
2205 mov edi, row // edi ==> Avg(x)
2206 cmp ebx, FullLength // Test if offset at end of array
2207 jnb davg1end
2208 // Do Avg decode for remaining bytes
2209 mov esi, prev_row // esi ==> Prior(x)
2210 mov edx, edi
2211 xor ecx, ecx // zero ecx before using cl & cx in loop below
2212 sub edx, bpp // edx ==> Raw(x-bpp)
2213 davg1lp:
2214 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2215 xor eax, eax
2216 mov cl, [esi + ebx] // load cl with Prior(x)
2217 mov al, [edx + ebx] // load al with Raw(x-bpp)
2218 add ax, cx
2219 inc ebx
2220 shr ax, 1 // divide by 2
2221 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2222 cmp ebx, FullLength // Check if at end of array
2223 mov [edi+ebx-1], al // Write back Raw(x);
2224 // mov does not affect flags; -1 to offset inc ebx
2225 jb davg1lp
2226 davg1end:
2227 } // end _asm block
2228 }
2229 return; // this path executed no MMX instructions in this function, so the emms in the clean-up block is skipped
2230
// bpp == 8: Raw(x-bpp) for an 8-byte step is exactly the previous
// 8-byte result, so it is carried in mm2 with no shifting or masking.
2231 case 8: // bpp == 8
2232 {
2233 _asm {
2234 // Re-init address pointers and offset
2235 mov ebx, diff // ebx ==> x = offset to alignment boundary
2236 movq mm5, LBCarryMask
2237 mov edi, row // edi ==> Avg(x)
2238 movq mm4, HBClearMask
2239 mov esi, prev_row // esi ==> Prior(x)
2240 // PRIME the pump (load the first Raw(x-bpp) data set
2241 movq mm2, [edi + ebx - 8] // Load previous aligned 8 bytes
2242 // (NO NEED to correct position in loop below)
2243 davg8lp:
2244 movq mm0, [edi + ebx]
2245 movq mm3, mm5
2246 movq mm1, [esi + ebx]
2247 add ebx, 8
2248 pand mm3, mm1 // get lsb for each prev_row byte
2249 psrlq mm1, 1 // divide prev_row bytes by 2
2250 pand mm3, mm2 // get LBCarrys for each byte where both
2251 // lsb's were == 1
2252 psrlq mm2, 1 // divide raw bytes by 2
2253 pand mm1, mm4 // clear invalid bit 7 of each byte
2254 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2255 pand mm2, mm4 // clear invalid bit 7 of each byte
2256 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2257 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2258 cmp ebx, MMXLength
2259 movq [edi + ebx - 8], mm0
2260 movq mm2, mm0 // reuse as Raw(x-bpp)
2261 jb davg8lp // flags still come from the cmp above; movq leaves them alone
2262 } // end _asm block
2263 }
2264 break;
// Any other bpp: Raw(x-bpp) is reloaded from memory through edx on every
// iteration instead of being carried in a register.
2265 default: // bpp greater than 8
2266 {
2267 _asm {
2268 movq mm5, LBCarryMask
2269 // Re-init address pointers and offset
2270 mov ebx, diff // ebx ==> x = offset to alignment boundary
2271 mov edi, row // edi ==> Avg(x)
2272 movq mm4, HBClearMask
2273 mov edx, edi
2274 mov esi, prev_row // esi ==> Prior(x)
2275 sub edx, bpp // edx ==> Raw(x-bpp)
2276 davgAlp:
2277 movq mm0, [edi + ebx]
2278 movq mm3, mm5
2279 movq mm1, [esi + ebx]
2280 pand mm3, mm1 // get lsb for each prev_row byte
2281 movq mm2, [edx + ebx]
2282 psrlq mm1, 1 // divide prev_row bytes by 2
2283 pand mm3, mm2 // get LBCarrys for each byte where both
2284 // lsb's were == 1
2285 psrlq mm2, 1 // divide raw bytes by 2
2286 pand mm1, mm4 // clear invalid bit 7 of each byte
2287 paddb mm0, mm3 // add LBCarrys to Avg for each byte
2288 pand mm2, mm4 // clear invalid bit 7 of each byte
2289 paddb mm0, mm1 // add (Prev_row/2) to Avg for each byte
2290 add ebx, 8
2291 paddb mm0, mm2 // add (Raw/2) to Avg for each byte
2292 cmp ebx, MMXLength
2293 movq [edi + ebx - 8], mm0
2294 jb davgAlp // flags still come from the cmp above; movq leaves them alone
2295 } // end _asm block
2296 }
2297 break;
2298 } // end switch ( bpp )
2299
2300 _asm {
2301 // MMX acceleration complete now do clean-up
2302 // Check if any remaining bytes left to decode
2303 mov ebx, MMXLength // ebx ==> x = offset bytes remaining after MMX
2304 mov edi, row // edi ==> Avg(x)
2305 cmp ebx, FullLength // Test if offset at end of array
2306 jnb davgend
2307 // Do Avg decode for remaining bytes
2308 mov esi, prev_row // esi ==> Prior(x)
2309 mov edx, edi
2310 xor ecx, ecx // zero ecx before using cl & cx in loop below
2311 sub edx, bpp // edx ==> Raw(x-bpp)
2312 davglp2:
2313 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2314 xor eax, eax
2315 mov cl, [esi + ebx] // load cl with Prior(x)
2316 mov al, [edx + ebx] // load al with Raw(x-bpp)
2317 add ax, cx
2318 inc ebx
2319 shr ax, 1 // divide by 2
2320 add al, [edi+ebx-1] // Add Avg(x); -1 to offset inc ebx
2321 cmp ebx, FullLength // Check if at end of array
2322 mov [edi+ebx-1], al // Write back Raw(x);
2323 // mov does not affect flags; -1 to offset inc ebx
2324 jb davglp2
2325 davgend:
2326 emms // End MMX instructions; prep for possible FP instrs.
2327 } // end _asm block
2328 }
2329
2330 // Optimized code for PNG Paeth filter decoder
2331 void /* PRIVATE */
2332 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2333 png_bytep prev_row)
2334 {
2335 png_uint_32 FullLength;
2336 png_uint_32 MMXLength;
2337 //png_uint_32 len;
2338 int bpp;
2339 int diff;
2340 //int ptemp;
2341 int patemp, pbtemp, pctemp;
2342
2343 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
2344 FullLength = row_info->rowbytes; // # of bytes to filter
2345 _asm
2346 {
2347 xor ebx, ebx // ebx ==> x offset
2348 mov edi, row
2349 xor edx, edx // edx ==> x-bpp offset
2350 mov esi, prev_row
2351 xor eax, eax
2352
2353 // Compute the Raw value for the first bpp bytes
2354 // Note: the formula works out to be always
2355 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
2356 dpthrlp:
2357 mov al, [edi + ebx]
2358 add al, [esi + ebx]
2359 inc ebx
2360 cmp ebx, bpp
2361 mov [edi + ebx - 1], al
2362 jb dpthrlp
2363 // get # of bytes to alignment
2364 mov diff, edi // take start of row
2365 add diff, ebx // add bpp
2366 xor ecx, ecx
2367 add diff, 0xf // add 7 + 8 to incr past alignment boundary
2368 and diff, 0xfffffff8 // mask to alignment boundary
2369 sub diff, edi // subtract from start ==> value ebx at alignment
2370 jz dpthgo
2371 // fix alignment
2372 dpthlp1:
2373 xor eax, eax
2374 // pav = p - a = (a + b - c) - a = b - c
2375 mov al, [esi + ebx] // load Prior(x) into al
2376 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2377 sub eax, ecx // subtract Prior(x-bpp)
2378 mov patemp, eax // Save pav for later use
2379 xor eax, eax
2380 // pbv = p - b = (a + b - c) - b = a - c
2381 mov al, [edi + edx] // load Raw(x-bpp) into al
2382 sub eax, ecx // subtract Prior(x-bpp)
2383 mov ecx, eax
2384 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2385 add eax, patemp // pcv = pav + pbv
2386 // pc = abs(pcv)
2387 test eax, 0x80000000
2388 jz dpthpca
2389 neg eax // reverse sign of neg values
2390 dpthpca:
2391 mov pctemp, eax // save pc for later use
2392 // pb = abs(pbv)
2393 test ecx, 0x80000000
2394 jz dpthpba
2395 neg ecx // reverse sign of neg values
2396 dpthpba:
2397 mov pbtemp, ecx // save pb for later use
2398 // pa = abs(pav)
2399 mov eax, patemp
2400 test eax, 0x80000000
2401 jz dpthpaa
2402 neg eax // reverse sign of neg values
2403 dpthpaa:
2404 mov patemp, eax // save pa for later use
2405 // test if pa <= pb
2406 cmp eax, ecx
2407 jna dpthabb
2408 // pa > pb; now test if pb <= pc
2409 cmp ecx, pctemp
2410 jna dpthbbc
2411 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2412 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2413 jmp dpthpaeth
2414 dpthbbc:
2415 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
2416 mov cl, [esi + ebx] // load Prior(x) into cl
2417 jmp dpthpaeth
2418 dpthabb:
2419 // pa <= pb; now test if pa <= pc
2420 cmp eax, pctemp
2421 jna dpthabc
2422 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
2423 mov cl, [esi + edx] // load Prior(x-bpp) into cl
2424 jmp dpthpaeth
2425 dpthabc:
2426 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
2427 mov cl, [edi + edx] // load Raw(x-bpp) into cl
2428 dpthpaeth:
2429 inc ebx
2430 inc edx
2431 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
2432 add [edi + ebx - 1], cl
2433 cmp ebx, diff
2434 jb dpthlp1
2435 dpthgo:
2436 mov ecx, FullLength
2437 mov eax, ecx
2438 sub eax, ebx // subtract alignment fix
2439 and eax, 0x00000007 // calc bytes over mult of 8
2440 sub ecx, eax // drop over bytes from original length
2441 mov MMXLength, ecx
2442 } // end _asm block
2443 // Now do the math for the rest of the row
2444 switch ( bpp )
2445 {
2446 case 3:
2447 {
2448 ActiveMask.use = 0x0000000000ffffff;
2449 ActiveMaskEnd.use = 0xffff000000000000;
2450 ShiftBpp.use = 24; // == bpp(3) * 8
2451 ShiftRem.use = 40; // == 64 - 24
2452 _asm
2453 {
2454 mov ebx, diff
2455 mov edi, row
2456 mov esi, prev_row
2457 pxor mm0, mm0
2458 // PRIME the pump (load the first Raw(x-bpp) data set
2459 movq mm1, [edi+ebx-8]
2460 dpth3lp:
2461 psrlq mm1, ShiftRem // shift last 3 bytes to 1st 3 bytes
2462 movq mm2, [esi + ebx] // load b=Prior(x)
2463 punpcklbw mm1, mm0 // Unpack High bytes of a
2464 movq mm3, [esi+ebx-8] // Prep c=Prior(x-bpp) bytes
2465 punpcklbw mm2, mm0 // Unpack High bytes of b
2466 psrlq mm3, ShiftRem // shift last 3 bytes to 1st 3 bytes
2467 // pav = p - a = (a + b - c) - a = b - c
2468 movq mm4, mm2
2469 punpcklbw mm3, mm0 // Unpack High bytes of c
2470 // pbv = p - b = (a + b - c) - b = a - c
2471 movq mm5, mm1
2472 psubw mm4, mm3
2473 pxor mm7, mm7
2474 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2475 movq mm6, mm4
2476 psubw mm5, mm3
2477
2478 // pa = abs(p-a) = abs(pav)
2479 // pb = abs(p-b) = abs(pbv)
2480 // pc = abs(p-c) = abs(pcv)
2481 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2482 paddw mm6, mm5
2483 pand mm0, mm4 // Only pav bytes < 0 in mm7
2484 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2485 psubw mm4, mm0
2486 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2487 psubw mm4, mm0
2488 psubw mm5, mm7
2489 pxor mm0, mm0
2490 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2491 pand mm0, mm6 // Only pav bytes < 0 in mm7
2492 psubw mm5, mm7
2493 psubw mm6, mm0
2494 // test pa <= pb
2495 movq mm7, mm4
2496 psubw mm6, mm0
2497 pcmpgtw mm7, mm5 // pa > pb?
2498 movq mm0, mm7
2499 // use mm7 mask to merge pa & pb
2500 pand mm5, mm7
2501 // use mm0 mask copy to merge a & b
2502 pand mm2, mm0
2503 pandn mm7, mm4
2504 pandn mm0, mm1
2505 paddw mm7, mm5
2506 paddw mm0, mm2
2507 // test ((pa <= pb)? pa:pb) <= pc
2508 pcmpgtw mm7, mm6 // pab > pc?
2509 pxor mm1, mm1
2510 pand mm3, mm7
2511 pandn mm7, mm0
2512 paddw mm7, mm3
2513 pxor mm0, mm0
2514 packuswb mm7, mm1
2515 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2516 pand mm7, ActiveMask
2517 movq mm2, mm3 // load b=Prior(x) step 1
2518 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2519 punpcklbw mm3, mm0 // Unpack High bytes of c
2520 movq [edi + ebx], mm7 // write back updated value
2521 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2522 // Now do Paeth for 2nd set of bytes (3-5)
2523 psrlq mm2, ShiftBpp // load b=Prior(x) step 2
2524 punpcklbw mm1, mm0 // Unpack High bytes of a
2525 pxor mm7, mm7
2526 punpcklbw mm2, mm0 // Unpack High bytes of b
2527 // pbv = p - b = (a + b - c) - b = a - c
2528 movq mm5, mm1
2529 // pav = p - a = (a + b - c) - a = b - c
2530 movq mm4, mm2
2531 psubw mm5, mm3
2532 psubw mm4, mm3
2533 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
2534 // pav + pbv = pbv + pav
2535 movq mm6, mm5
2536 paddw mm6, mm4
2537
2538 // pa = abs(p-a) = abs(pav)
2539 // pb = abs(p-b) = abs(pbv)
2540 // pc = abs(p-c) = abs(pcv)
2541 pcmpgtw mm0, mm5 // Create mask pbv bytes < 0
2542 pcmpgtw mm7, mm4 // Create mask pav bytes < 0
2543 pand mm0, mm5 // Only pbv bytes < 0 in mm0
2544 pand mm7, mm4 // Only pav bytes < 0 in mm7
2545 psubw mm5, mm0
2546 psubw mm4, mm7
2547 psubw mm5, mm0
2548 psubw mm4, mm7
2549 pxor mm0, mm0
2550 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2551 pand mm0, mm6 // Only pav bytes < 0 in mm7
2552 psubw mm6, mm0
2553 // test pa <= pb
2554 movq mm7, mm4
2555 psubw mm6, mm0
2556 pcmpgtw mm7, mm5 // pa > pb?
2557 movq mm0, mm7
2558 // use mm7 mask to merge pa & pb
2559 pand mm5, mm7
2560 // use mm0 mask copy to merge a & b
2561 pand mm2, mm0
2562 pandn mm7, mm4
2563 pandn mm0, mm1
2564 paddw mm7, mm5
2565 paddw mm0, mm2
2566 // test ((pa <= pb)? pa:pb) <= pc
2567 pcmpgtw mm7, mm6 // pab > pc?
2568 movq mm2, [esi + ebx] // load b=Prior(x)
2569 pand mm3, mm7
2570 pandn mm7, mm0
2571 pxor mm1, mm1
2572 paddw mm7, mm3
2573 pxor mm0, mm0
2574 packuswb mm7, mm1
2575 movq mm3, mm2 // load c=Prior(x-bpp) step 1
2576 pand mm7, ActiveMask
2577 punpckhbw mm2, mm0 // Unpack High bytes of b
2578 psllq mm7, ShiftBpp // Shift bytes to 2nd group of 3 bytes
2579 // pav = p - a = (a + b - c) - a = b - c
2580 movq mm4, mm2
2581 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2582 psllq mm3, ShiftBpp // load c=Prior(x-bpp) step 2
2583 movq [edi + ebx], mm7 // write back updated value
2584 movq mm1, mm7
2585 punpckhbw mm3, mm0 // Unpack High bytes of c
2586 psllq mm1, ShiftBpp // Shift bytes
2587 // Now mm1 will be used as Raw(x-bpp)
2588 // Now do Paeth for 3rd, and final, set of bytes (6-7)
2589 pxor mm7, mm7
2590 punpckhbw mm1, mm0 // Unpack High bytes of a
2591 psubw mm4, mm3
2592 // pbv = p - b = (a + b - c) - b = a - c
2593 movq mm5, mm1
2594 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2595 movq mm6, mm4
2596 psubw mm5, mm3
2597 pxor mm0, mm0
2598 paddw mm6, mm5
2599
2600 // pa = abs(p-a) = abs(pav)
2601 // pb = abs(p-b) = abs(pbv)
2602 // pc = abs(p-c) = abs(pcv)
2603 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2604 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2605 pand mm0, mm4 // Only pav bytes < 0 in mm7
2606 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2607 psubw mm4, mm0
2608 psubw mm5, mm7
2609 psubw mm4, mm0
2610 psubw mm5, mm7
2611 pxor mm0, mm0
2612 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2613 pand mm0, mm6 // Only pav bytes < 0 in mm7
2614 psubw mm6, mm0
2615 // test pa <= pb
2616 movq mm7, mm4
2617 psubw mm6, mm0
2618 pcmpgtw mm7, mm5 // pa > pb?
2619 movq mm0, mm7
2620 // use mm0 mask copy to merge a & b
2621 pand mm2, mm0
2622 // use mm7 mask to merge pa & pb
2623 pand mm5, mm7
2624 pandn mm0, mm1
2625 pandn mm7, mm4
2626 paddw mm0, mm2
2627 paddw mm7, mm5
2628 // test ((pa <= pb)? pa:pb) <= pc
2629 pcmpgtw mm7, mm6 // pab > pc?
2630 pand mm3, mm7
2631 pandn mm7, mm0
2632 paddw mm7, mm3
2633 pxor mm1, mm1
2634 packuswb mm1, mm7
2635 // Step ebx to next set of 8 bytes and repeat loop til done
2636 add ebx, 8
2637 pand mm1, ActiveMaskEnd
2638 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2639
2640 cmp ebx, MMXLength
2641 pxor mm0, mm0 // pxor does not affect flags
2642 movq [edi + ebx - 8], mm1 // write back updated value
2643 // mm1 will be used as Raw(x-bpp) next loop
2644 // mm3 ready to be used as Prior(x-bpp) next loop
2645 jb dpth3lp
2646 } // end _asm block
2647 }
2648 break;
2649
2650 case 6:
2651 case 7:
2652 case 5:
2653 {
2654 ActiveMask.use = 0x00000000ffffffff;
2655 ActiveMask2.use = 0xffffffff00000000;
2656 ShiftBpp.use = bpp << 3; // == bpp * 8
2657 ShiftRem.use = 64 - ShiftBpp.use;
2658 _asm
2659 {
2660 mov ebx, diff
2661 mov edi, row
2662 mov esi, prev_row
2663 // PRIME the pump (load the first Raw(x-bpp) data set
2664 movq mm1, [edi+ebx-8]
2665 pxor mm0, mm0
2666 dpth6lp:
2667 // Must shift to position Raw(x-bpp) data
2668 psrlq mm1, ShiftRem
2669 // Do first set of 4 bytes
2670 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2671 punpcklbw mm1, mm0 // Unpack Low bytes of a
2672 movq mm2, [esi + ebx] // load b=Prior(x)
2673 punpcklbw mm2, mm0 // Unpack Low bytes of b
2674 // Must shift to position Prior(x-bpp) data
2675 psrlq mm3, ShiftRem
2676 // pav = p - a = (a + b - c) - a = b - c
2677 movq mm4, mm2
2678 punpcklbw mm3, mm0 // Unpack Low bytes of c
2679 // pbv = p - b = (a + b - c) - b = a - c
2680 movq mm5, mm1
2681 psubw mm4, mm3
2682 pxor mm7, mm7
2683 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2684 movq mm6, mm4
2685 psubw mm5, mm3
2686 // pa = abs(p-a) = abs(pav)
2687 // pb = abs(p-b) = abs(pbv)
2688 // pc = abs(p-c) = abs(pcv)
2689 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2690 paddw mm6, mm5
2691 pand mm0, mm4 // Only pav bytes < 0 in mm7
2692 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2693 psubw mm4, mm0
2694 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2695 psubw mm4, mm0
2696 psubw mm5, mm7
2697 pxor mm0, mm0
2698 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2699 pand mm0, mm6 // Only pav bytes < 0 in mm7
2700 psubw mm5, mm7
2701 psubw mm6, mm0
2702 // test pa <= pb
2703 movq mm7, mm4
2704 psubw mm6, mm0
2705 pcmpgtw mm7, mm5 // pa > pb?
2706 movq mm0, mm7
2707 // use mm7 mask to merge pa & pb
2708 pand mm5, mm7
2709 // use mm0 mask copy to merge a & b
2710 pand mm2, mm0
2711 pandn mm7, mm4
2712 pandn mm0, mm1
2713 paddw mm7, mm5
2714 paddw mm0, mm2
2715 // test ((pa <= pb)? pa:pb) <= pc
2716 pcmpgtw mm7, mm6 // pab > pc?
2717 pxor mm1, mm1
2718 pand mm3, mm7
2719 pandn mm7, mm0
2720 paddw mm7, mm3
2721 pxor mm0, mm0
2722 packuswb mm7, mm1
2723 movq mm3, [esi + ebx - 8] // load c=Prior(x-bpp)
2724 pand mm7, ActiveMask
2725 psrlq mm3, ShiftRem
2726 movq mm2, [esi + ebx] // load b=Prior(x) step 1
2727 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2728 movq mm6, mm2
2729 movq [edi + ebx], mm7 // write back updated value
2730 movq mm1, [edi+ebx-8]
2731 psllq mm6, ShiftBpp
2732 movq mm5, mm7
2733 psrlq mm1, ShiftRem
2734 por mm3, mm6
2735 psllq mm5, ShiftBpp
2736 punpckhbw mm3, mm0 // Unpack High bytes of c
2737 por mm1, mm5
2738 // Do second set of 4 bytes
2739 punpckhbw mm2, mm0 // Unpack High bytes of b
2740 punpckhbw mm1, mm0 // Unpack High bytes of a
2741 // pav = p - a = (a + b - c) - a = b - c
2742 movq mm4, mm2
2743 // pbv = p - b = (a + b - c) - b = a - c
2744 movq mm5, mm1
2745 psubw mm4, mm3
2746 pxor mm7, mm7
2747 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2748 movq mm6, mm4
2749 psubw mm5, mm3
2750 // pa = abs(p-a) = abs(pav)
2751 // pb = abs(p-b) = abs(pbv)
2752 // pc = abs(p-c) = abs(pcv)
2753 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2754 paddw mm6, mm5
2755 pand mm0, mm4 // Only pav bytes < 0 in mm7
2756 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2757 psubw mm4, mm0
2758 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2759 psubw mm4, mm0
2760 psubw mm5, mm7
2761 pxor mm0, mm0
2762 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2763 pand mm0, mm6 // Only pav bytes < 0 in mm7
2764 psubw mm5, mm7
2765 psubw mm6, mm0
2766 // test pa <= pb
2767 movq mm7, mm4
2768 psubw mm6, mm0
2769 pcmpgtw mm7, mm5 // pa > pb?
2770 movq mm0, mm7
2771 // use mm7 mask to merge pa & pb
2772 pand mm5, mm7
2773 // use mm0 mask copy to merge a & b
2774 pand mm2, mm0
2775 pandn mm7, mm4
2776 pandn mm0, mm1
2777 paddw mm7, mm5
2778 paddw mm0, mm2
2779 // test ((pa <= pb)? pa:pb) <= pc
2780 pcmpgtw mm7, mm6 // pab > pc?
2781 pxor mm1, mm1
2782 pand mm3, mm7
2783 pandn mm7, mm0
2784 pxor mm1, mm1
2785 paddw mm7, mm3
2786 pxor mm0, mm0
2787 // Step ex to next set of 8 bytes and repeat loop til done
2788 add ebx, 8
2789 packuswb mm1, mm7
2790 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2791 cmp ebx, MMXLength
2792 movq [edi + ebx - 8], mm1 // write back updated value
2793 // mm1 will be used as Raw(x-bpp) next loop
2794 jb dpth6lp
2795 } // end _asm block
2796 }
2797 break;
2798
2799 case 4:
2800 {
2801 ActiveMask.use = 0x00000000ffffffff;
2802 _asm {
2803 mov ebx, diff
2804 mov edi, row
2805 mov esi, prev_row
2806 pxor mm0, mm0
2807 // PRIME the pump (load the first Raw(x-bpp) data set
2808 movq mm1, [edi+ebx-8] // Only time should need to read
2809 // a=Raw(x-bpp) bytes
2810 dpth4lp:
2811 // Do first set of 4 bytes
2812 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2813 punpckhbw mm1, mm0 // Unpack Low bytes of a
2814 movq mm2, [esi + ebx] // load b=Prior(x)
2815 punpcklbw mm2, mm0 // Unpack High bytes of b
2816 // pav = p - a = (a + b - c) - a = b - c
2817 movq mm4, mm2
2818 punpckhbw mm3, mm0 // Unpack High bytes of c
2819 // pbv = p - b = (a + b - c) - b = a - c
2820 movq mm5, mm1
2821 psubw mm4, mm3
2822 pxor mm7, mm7
2823 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2824 movq mm6, mm4
2825 psubw mm5, mm3
2826 // pa = abs(p-a) = abs(pav)
2827 // pb = abs(p-b) = abs(pbv)
2828 // pc = abs(p-c) = abs(pcv)
2829 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2830 paddw mm6, mm5
2831 pand mm0, mm4 // Only pav bytes < 0 in mm7
2832 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2833 psubw mm4, mm0
2834 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2835 psubw mm4, mm0
2836 psubw mm5, mm7
2837 pxor mm0, mm0
2838 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2839 pand mm0, mm6 // Only pav bytes < 0 in mm7
2840 psubw mm5, mm7
2841 psubw mm6, mm0
2842 // test pa <= pb
2843 movq mm7, mm4
2844 psubw mm6, mm0
2845 pcmpgtw mm7, mm5 // pa > pb?
2846 movq mm0, mm7
2847 // use mm7 mask to merge pa & pb
2848 pand mm5, mm7
2849 // use mm0 mask copy to merge a & b
2850 pand mm2, mm0
2851 pandn mm7, mm4
2852 pandn mm0, mm1
2853 paddw mm7, mm5
2854 paddw mm0, mm2
2855 // test ((pa <= pb)? pa:pb) <= pc
2856 pcmpgtw mm7, mm6 // pab > pc?
2857 pxor mm1, mm1
2858 pand mm3, mm7
2859 pandn mm7, mm0
2860 paddw mm7, mm3
2861 pxor mm0, mm0
2862 packuswb mm7, mm1
2863 movq mm3, [esi + ebx] // load c=Prior(x-bpp)
2864 pand mm7, ActiveMask
2865 movq mm2, mm3 // load b=Prior(x) step 1
2866 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2867 punpcklbw mm3, mm0 // Unpack High bytes of c
2868 movq [edi + ebx], mm7 // write back updated value
2869 movq mm1, mm7 // Now mm1 will be used as Raw(x-bpp)
2870 // Do second set of 4 bytes
2871 punpckhbw mm2, mm0 // Unpack Low bytes of b
2872 punpcklbw mm1, mm0 // Unpack Low bytes of a
2873 // pav = p - a = (a + b - c) - a = b - c
2874 movq mm4, mm2
2875 // pbv = p - b = (a + b - c) - b = a - c
2876 movq mm5, mm1
2877 psubw mm4, mm3
2878 pxor mm7, mm7
2879 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2880 movq mm6, mm4
2881 psubw mm5, mm3
2882 // pa = abs(p-a) = abs(pav)
2883 // pb = abs(p-b) = abs(pbv)
2884 // pc = abs(p-c) = abs(pcv)
2885 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2886 paddw mm6, mm5
2887 pand mm0, mm4 // Only pav bytes < 0 in mm7
2888 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2889 psubw mm4, mm0
2890 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2891 psubw mm4, mm0
2892 psubw mm5, mm7
2893 pxor mm0, mm0
2894 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2895 pand mm0, mm6 // Only pav bytes < 0 in mm7
2896 psubw mm5, mm7
2897 psubw mm6, mm0
2898 // test pa <= pb
2899 movq mm7, mm4
2900 psubw mm6, mm0
2901 pcmpgtw mm7, mm5 // pa > pb?
2902 movq mm0, mm7
2903 // use mm7 mask to merge pa & pb
2904 pand mm5, mm7
2905 // use mm0 mask copy to merge a & b
2906 pand mm2, mm0
2907 pandn mm7, mm4
2908 pandn mm0, mm1
2909 paddw mm7, mm5
2910 paddw mm0, mm2
2911 // test ((pa <= pb)? pa:pb) <= pc
2912 pcmpgtw mm7, mm6 // pab > pc?
2913 pxor mm1, mm1
2914 pand mm3, mm7
2915 pandn mm7, mm0
2916 pxor mm1, mm1
2917 paddw mm7, mm3
2918 pxor mm0, mm0
2919 // Step ex to next set of 8 bytes and repeat loop til done
2920 add ebx, 8
2921 packuswb mm1, mm7
2922 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
2923 cmp ebx, MMXLength
2924 movq [edi + ebx - 8], mm1 // write back updated value
2925 // mm1 will be used as Raw(x-bpp) next loop
2926 jb dpth4lp
2927 } // end _asm block
2928 }
2929 break;
2930 case 8: // bpp == 8
2931 {
2932 ActiveMask.use = 0x00000000ffffffff;
2933 _asm {
2934 mov ebx, diff
2935 mov edi, row
2936 mov esi, prev_row
2937 pxor mm0, mm0
2938 // PRIME the pump (load the first Raw(x-bpp) data set
2939 movq mm1, [edi+ebx-8] // Only time should need to read
2940 // a=Raw(x-bpp) bytes
2941 dpth8lp:
2942 // Do first set of 4 bytes
2943 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2944 punpcklbw mm1, mm0 // Unpack Low bytes of a
2945 movq mm2, [esi + ebx] // load b=Prior(x)
2946 punpcklbw mm2, mm0 // Unpack Low bytes of b
2947 // pav = p - a = (a + b - c) - a = b - c
2948 movq mm4, mm2
2949 punpcklbw mm3, mm0 // Unpack Low bytes of c
2950 // pbv = p - b = (a + b - c) - b = a - c
2951 movq mm5, mm1
2952 psubw mm4, mm3
2953 pxor mm7, mm7
2954 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
2955 movq mm6, mm4
2956 psubw mm5, mm3
2957 // pa = abs(p-a) = abs(pav)
2958 // pb = abs(p-b) = abs(pbv)
2959 // pc = abs(p-c) = abs(pcv)
2960 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
2961 paddw mm6, mm5
2962 pand mm0, mm4 // Only pav bytes < 0 in mm7
2963 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
2964 psubw mm4, mm0
2965 pand mm7, mm5 // Only pbv bytes < 0 in mm0
2966 psubw mm4, mm0
2967 psubw mm5, mm7
2968 pxor mm0, mm0
2969 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
2970 pand mm0, mm6 // Only pav bytes < 0 in mm7
2971 psubw mm5, mm7
2972 psubw mm6, mm0
2973 // test pa <= pb
2974 movq mm7, mm4
2975 psubw mm6, mm0
2976 pcmpgtw mm7, mm5 // pa > pb?
2977 movq mm0, mm7
2978 // use mm7 mask to merge pa & pb
2979 pand mm5, mm7
2980 // use mm0 mask copy to merge a & b
2981 pand mm2, mm0
2982 pandn mm7, mm4
2983 pandn mm0, mm1
2984 paddw mm7, mm5
2985 paddw mm0, mm2
2986 // test ((pa <= pb)? pa:pb) <= pc
2987 pcmpgtw mm7, mm6 // pab > pc?
2988 pxor mm1, mm1
2989 pand mm3, mm7
2990 pandn mm7, mm0
2991 paddw mm7, mm3
2992 pxor mm0, mm0
2993 packuswb mm7, mm1
2994 movq mm3, [esi+ebx-8] // read c=Prior(x-bpp) bytes
2995 pand mm7, ActiveMask
2996 movq mm2, [esi + ebx] // load b=Prior(x)
2997 paddb mm7, [edi + ebx] // add Paeth predictor with Raw(x)
2998 punpckhbw mm3, mm0 // Unpack High bytes of c
2999 movq [edi + ebx], mm7 // write back updated value
3000 movq mm1, [edi+ebx-8] // read a=Raw(x-bpp) bytes
3001
3002 // Do second set of 4 bytes
3003 punpckhbw mm2, mm0 // Unpack High bytes of b
3004 punpckhbw mm1, mm0 // Unpack High bytes of a
3005 // pav = p - a = (a + b - c) - a = b - c
3006 movq mm4, mm2
3007 // pbv = p - b = (a + b - c) - b = a - c
3008 movq mm5, mm1
3009 psubw mm4, mm3
3010 pxor mm7, mm7
3011 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3012 movq mm6, mm4
3013 psubw mm5, mm3
3014 // pa = abs(p-a) = abs(pav)
3015 // pb = abs(p-b) = abs(pbv)
3016 // pc = abs(p-c) = abs(pcv)
3017 pcmpgtw mm0, mm4 // Create mask pav bytes < 0
3018 paddw mm6, mm5
3019 pand mm0, mm4 // Only pav bytes < 0 in mm7
3020 pcmpgtw mm7, mm5 // Create mask pbv bytes < 0
3021 psubw mm4, mm0
3022 pand mm7, mm5 // Only pbv bytes < 0 in mm0
3023 psubw mm4, mm0
3024 psubw mm5, mm7
3025 pxor mm0, mm0
3026 pcmpgtw mm0, mm6 // Create mask pcv bytes < 0
3027 pand mm0, mm6 // Only pav bytes < 0 in mm7
3028 psubw mm5, mm7
3029 psubw mm6, mm0
3030 // test pa <= pb
3031 movq mm7, mm4
3032 psubw mm6, mm0
3033 pcmpgtw mm7, mm5 // pa > pb?
3034 movq mm0, mm7
3035 // use mm7 mask to merge pa & pb
3036 pand mm5, mm7
3037 // use mm0 mask copy to merge a & b
3038 pand mm2, mm0
3039 pandn mm7, mm4
3040 pandn mm0, mm1
3041 paddw mm7, mm5
3042 paddw mm0, mm2
3043 // test ((pa <= pb)? pa:pb) <= pc
3044 pcmpgtw mm7, mm6 // pab > pc?
3045 pxor mm1, mm1
3046 pand mm3, mm7
3047 pandn mm7, mm0
3048 pxor mm1, mm1
3049 paddw mm7, mm3
3050 pxor mm0, mm0
3051 // Step ex to next set of 8 bytes and repeat loop til done
3052 add ebx, 8
3053 packuswb mm1, mm7
3054 paddb mm1, [edi + ebx - 8] // add Paeth predictor with Raw(x)
3055 cmp ebx, MMXLength
3056 movq [edi + ebx - 8], mm1 // write back updated value
3057 // mm1 will be used as Raw(x-bpp) next loop
3058 jb dpth8lp
3059 } // end _asm block
3060 }
3061 break;
3062
3063 case 1: // bpp = 1
3064 case 2: // bpp = 2
3065 default: // bpp > 8
3066 {
3067 _asm {
3068 mov ebx, diff
3069 cmp ebx, FullLength
3070 jnb dpthdend
3071 mov edi, row
3072 mov esi, prev_row
3073 // Do Paeth decode for remaining bytes
3074 mov edx, ebx
3075 xor ecx, ecx // zero ecx before using cl & cx in loop below
3076 sub edx, bpp // Set edx = ebx - bpp
3077 dpthdlp:
3078 xor eax, eax
3079 // pav = p - a = (a + b - c) - a = b - c
3080 mov al, [esi + ebx] // load Prior(x) into al
3081 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3082 sub eax, ecx // subtract Prior(x-bpp)
3083 mov patemp, eax // Save pav for later use
3084 xor eax, eax
3085 // pbv = p - b = (a + b - c) - b = a - c
3086 mov al, [edi + edx] // load Raw(x-bpp) into al
3087 sub eax, ecx // subtract Prior(x-bpp)
3088 mov ecx, eax
3089 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3090 add eax, patemp // pcv = pav + pbv
3091 // pc = abs(pcv)
3092 test eax, 0x80000000
3093 jz dpthdpca
3094 neg eax // reverse sign of neg values
3095 dpthdpca:
3096 mov pctemp, eax // save pc for later use
3097 // pb = abs(pbv)
3098 test ecx, 0x80000000
3099 jz dpthdpba
3100 neg ecx // reverse sign of neg values
3101 dpthdpba:
3102 mov pbtemp, ecx // save pb for later use
3103 // pa = abs(pav)
3104 mov eax, patemp
3105 test eax, 0x80000000
3106 jz dpthdpaa
3107 neg eax // reverse sign of neg values
3108 dpthdpaa:
3109 mov patemp, eax // save pa for later use
3110 // test if pa <= pb
3111 cmp eax, ecx
3112 jna dpthdabb
3113 // pa > pb; now test if pb <= pc
3114 cmp ecx, pctemp
3115 jna dpthdbbc
3116 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3117 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3118 jmp dpthdpaeth
3119 dpthdbbc:
3120 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3121 mov cl, [esi + ebx] // load Prior(x) into cl
3122 jmp dpthdpaeth
3123 dpthdabb:
3124 // pa <= pb; now test if pa <= pc
3125 cmp eax, pctemp
3126 jna dpthdabc
3127 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3128 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3129 jmp dpthdpaeth
3130 dpthdabc:
3131 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3132 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3133 dpthdpaeth:
3134 inc ebx
3135 inc edx
3136 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3137 add [edi + ebx - 1], cl
3138 cmp ebx, FullLength
3139 jb dpthdlp
3140 dpthdend:
3141 } // end _asm block
3142 }
3143 return; // No need to go further with this one
3144 } // end switch ( bpp )
3145 _asm
3146 {
3147 // MMX acceleration complete now do clean-up
3148 // Check if any remaining bytes left to decode
3149 mov ebx, MMXLength
3150 cmp ebx, FullLength
3151 jnb dpthend
3152 mov edi, row
3153 mov esi, prev_row
3154 // Do Paeth decode for remaining bytes
3155 mov edx, ebx
3156 xor ecx, ecx // zero ecx before using cl & cx in loop below
3157 sub edx, bpp // Set edx = ebx - bpp
3158 dpthlp2:
3159 xor eax, eax
3160 // pav = p - a = (a + b - c) - a = b - c
3161 mov al, [esi + ebx] // load Prior(x) into al
3162 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3163 sub eax, ecx // subtract Prior(x-bpp)
3164 mov patemp, eax // Save pav for later use
3165 xor eax, eax
3166 // pbv = p - b = (a + b - c) - b = a - c
3167 mov al, [edi + edx] // load Raw(x-bpp) into al
3168 sub eax, ecx // subtract Prior(x-bpp)
3169 mov ecx, eax
3170 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3171 add eax, patemp // pcv = pav + pbv
3172 // pc = abs(pcv)
3173 test eax, 0x80000000
3174 jz dpthpca2
3175 neg eax // reverse sign of neg values
3176 dpthpca2:
3177 mov pctemp, eax // save pc for later use
3178 // pb = abs(pbv)
3179 test ecx, 0x80000000
3180 jz dpthpba2
3181 neg ecx // reverse sign of neg values
3182 dpthpba2:
3183 mov pbtemp, ecx // save pb for later use
3184 // pa = abs(pav)
3185 mov eax, patemp
3186 test eax, 0x80000000
3187 jz dpthpaa2
3188 neg eax // reverse sign of neg values
3189 dpthpaa2:
3190 mov patemp, eax // save pa for later use
3191 // test if pa <= pb
3192 cmp eax, ecx
3193 jna dpthabb2
3194 // pa > pb; now test if pb <= pc
3195 cmp ecx, pctemp
3196 jna dpthbbc2
3197 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3198 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3199 jmp dpthpaeth2
3200 dpthbbc2:
3201 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3202 mov cl, [esi + ebx] // load Prior(x) into cl
3203 jmp dpthpaeth2
3204 dpthabb2:
3205 // pa <= pb; now test if pa <= pc
3206 cmp eax, pctemp
3207 jna dpthabc2
3208 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3209 mov cl, [esi + edx] // load Prior(x-bpp) into cl
3210 jmp dpthpaeth2
3211 dpthabc2:
3212 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3213 mov cl, [edi + edx] // load Raw(x-bpp) into cl
3214 dpthpaeth2:
3215 inc ebx
3216 inc edx
3217 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3218 add [edi + ebx - 1], cl
3219 cmp ebx, FullLength
3220 jb dpthlp2
3221 dpthend:
3222 emms // End MMX instructions; prep for possible FP instrs.
3223 } // end _asm block
3224 }
3225
3226 // Optimized code for PNG Sub filter decoder
3227 void /* PRIVATE */
3228 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3229 {
3230 //int test;
3231 int bpp;
3232 png_uint_32 FullLength;
3233 png_uint_32 MMXLength;
3234 int diff;
3235
3236 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3237 FullLength = row_info->rowbytes - bpp; // # of bytes to filter
3238 _asm {
3239 mov edi, row
3240 mov esi, edi // lp = row
3241 add edi, bpp // rp = row + bpp
3242 xor eax, eax
3243 // get # of bytes to alignment
3244 mov diff, edi // take start of row
3245 add diff, 0xf // add 7 + 8 to incr past
3246 // alignment boundary
3247 xor ebx, ebx
3248 and diff, 0xfffffff8 // mask to alignment boundary
3249 sub diff, edi // subtract from start ==> value
3250 // ebx at alignment
3251 jz dsubgo
3252 // fix alignment
3253 dsublp1:
3254 mov al, [esi+ebx]
3255 add [edi+ebx], al
3256 inc ebx
3257 cmp ebx, diff
3258 jb dsublp1
3259 dsubgo:
3260 mov ecx, FullLength
3261 mov edx, ecx
3262 sub edx, ebx // subtract alignment fix
3263 and edx, 0x00000007 // calc bytes over mult of 8
3264 sub ecx, edx // drop over bytes from length
3265 mov MMXLength, ecx
3266 } // end _asm block
3267
3268 // Now do the math for the rest of the row
3269 switch ( bpp )
3270 {
3271 case 3:
3272 {
3273 ActiveMask.use = 0x0000ffffff000000;
3274 ShiftBpp.use = 24; // == 3 * 8
3275 ShiftRem.use = 40; // == 64 - 24
3276 _asm {
3277 mov edi, row
3278 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3279 mov esi, edi // lp = row
3280 add edi, bpp // rp = row + bpp
3281 movq mm6, mm7
3282 mov ebx, diff
3283 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3284 // byte group
3285 // PRIME the pump (load the first Raw(x-bpp) data set
3286 movq mm1, [edi+ebx-8]
3287 dsub3lp:
3288 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3289 // no need for mask; shift clears inactive bytes
3290 // Add 1st active group
3291 movq mm0, [edi+ebx]
3292 paddb mm0, mm1
3293 // Add 2nd active group
3294 movq mm1, mm0 // mov updated Raws to mm1
3295 psllq mm1, ShiftBpp // shift data to position correctly
3296 pand mm1, mm7 // mask to use only 2nd active group
3297 paddb mm0, mm1
3298 // Add 3rd active group
3299 movq mm1, mm0 // mov updated Raws to mm1
3300 psllq mm1, ShiftBpp // shift data to position correctly
3301 pand mm1, mm6 // mask to use only 3rd active group
3302 add ebx, 8
3303 paddb mm0, mm1
3304 cmp ebx, MMXLength
3305 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3306 // Prep for doing 1st add at top of loop
3307 movq mm1, mm0
3308 jb dsub3lp
3309 } // end _asm block
3310 }
3311 break;
3312
3313 case 1:
3314 {
3315 // Placed here just in case this is a duplicate of the
3316 // non-MMX code for the SUB filter in png_read_filter_row below
3317 //
3318 // png_bytep rp;
3319 // png_bytep lp;
3320 // png_uint_32 i;
3321 // bpp = (row_info->pixel_depth + 7) >> 3;
3322 // for (i = (png_uint_32)bpp, rp = row + bpp, lp = row;
3323 // i < row_info->rowbytes; i++, rp++, lp++)
3324 // {
3325 // *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff);
3326 // }
3327 _asm {
3328 mov ebx, diff
3329 mov edi, row
3330 cmp ebx, FullLength
3331 jnb dsub1end
3332 mov esi, edi // lp = row
3333 xor eax, eax
3334 add edi, bpp // rp = row + bpp
3335 dsub1lp:
3336 mov al, [esi+ebx]
3337 add [edi+ebx], al
3338 inc ebx
3339 cmp ebx, FullLength
3340 jb dsub1lp
3341 dsub1end:
3342 } // end _asm block
3343 }
3344 return;
3345
3346 case 6:
3347 case 7:
3348 case 4:
3349 case 5:
3350 {
3351 ShiftBpp.use = bpp << 3;
3352 ShiftRem.use = 64 - ShiftBpp.use;
3353 _asm {
3354 mov edi, row
3355 mov ebx, diff
3356 mov esi, edi // lp = row
3357 add edi, bpp // rp = row + bpp
3358 // PRIME the pump (load the first Raw(x-bpp) data set
3359 movq mm1, [edi+ebx-8]
3360 dsub4lp:
3361 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3362 // no need for mask; shift clears inactive bytes
3363 movq mm0, [edi+ebx]
3364 paddb mm0, mm1
3365 // Add 2nd active group
3366 movq mm1, mm0 // mov updated Raws to mm1
3367 psllq mm1, ShiftBpp // shift data to position correctly
3368 // there is no need for any mask
3369 // since shift clears inactive bits/bytes
3370 add ebx, 8
3371 paddb mm0, mm1
3372 cmp ebx, MMXLength
3373 movq [edi+ebx-8], mm0
3374 movq mm1, mm0 // Prep for doing 1st add at top of loop
3375 jb dsub4lp
3376 } // end _asm block
3377 }
3378 break;
3379
3380 case 2:
3381 {
3382 ActiveMask.use = 0x00000000ffff0000;
3383 ShiftBpp.use = 16; // == 2 * 8
3384 ShiftRem.use = 48; // == 64 - 16
3385 _asm {
3386 movq mm7, ActiveMask // Load ActiveMask for 2nd active byte group
3387 mov ebx, diff
3388 movq mm6, mm7
3389 mov edi, row
3390 psllq mm6, ShiftBpp // Move mask in mm6 to cover 3rd active
3391 // byte group
3392 mov esi, edi // lp = row
3393 movq mm5, mm6
3394 add edi, bpp // rp = row + bpp
3395 psllq mm5, ShiftBpp // Move mask in mm5 to cover 4th active
3396 // byte group
3397 // PRIME the pump (load the first Raw(x-bpp) data set
3398 movq mm1, [edi+ebx-8]
3399 dsub2lp:
3400 // Add 1st active group
3401 psrlq mm1, ShiftRem // Shift data for adding 1st bpp bytes
3402 // no need for mask; shift clears inactive
3403 // bytes
3404 movq mm0, [edi+ebx]
3405 paddb mm0, mm1
3406 // Add 2nd active group
3407 movq mm1, mm0 // mov updated Raws to mm1
3408 psllq mm1, ShiftBpp // shift data to position correctly
3409 pand mm1, mm7 // mask to use only 2nd active group
3410 paddb mm0, mm1
3411 // Add 3rd active group
3412 movq mm1, mm0 // mov updated Raws to mm1
3413 psllq mm1, ShiftBpp // shift data to position correctly
3414 pand mm1, mm6 // mask to use only 3rd active group
3415 paddb mm0, mm1
3416 // Add 4th active group
3417 movq mm1, mm0 // mov updated Raws to mm1
3418 psllq mm1, ShiftBpp // shift data to position correctly
3419 pand mm1, mm5 // mask to use only 4th active group
3420 add ebx, 8
3421 paddb mm0, mm1
3422 cmp ebx, MMXLength
3423 movq [edi+ebx-8], mm0 // Write updated Raws back to array
3424 movq mm1, mm0 // Prep for doing 1st add at top of loop
3425 jb dsub2lp
3426 } // end _asm block
3427 }
3428 break;
3429 case 8:
3430 {
3431 _asm {
3432 mov edi, row
3433 mov ebx, diff
3434 mov esi, edi // lp = row
3435 add edi, bpp // rp = row + bpp
3436 mov ecx, MMXLength
3437 movq mm7, [edi+ebx-8] // PRIME the pump (load the first
3438 // Raw(x-bpp) data set
3439 and ecx, 0x0000003f // calc bytes over mult of 64
3440 dsub8lp:
3441 movq mm0, [edi+ebx] // Load Sub(x) for 1st 8 bytes
3442 paddb mm0, mm7
3443 movq mm1, [edi+ebx+8] // Load Sub(x) for 2nd 8 bytes
3444 movq [edi+ebx], mm0 // Write Raw(x) for 1st 8 bytes
3445 // Now mm0 will be used as Raw(x-bpp) for
3446 // the 2nd group of 8 bytes. This will be
3447 // repeated for each group of 8 bytes with
3448 // the 8th group being used as the Raw(x-bpp)
3449 // for the 1st group of the next loop.
3450 paddb mm1, mm0
3451 movq mm2, [edi+ebx+16] // Load Sub(x) for 3rd 8 bytes
3452 movq [edi+ebx+8], mm1 // Write Raw(x) for 2nd 8 bytes
3453 paddb mm2, mm1
3454 movq mm3, [edi+ebx+24] // Load Sub(x) for 4th 8 bytes
3455 movq [edi+ebx+16], mm2 // Write Raw(x) for 3rd 8 bytes
3456 paddb mm3, mm2
3457 movq mm4, [edi+ebx+32] // Load Sub(x) for 5th 8 bytes
3458 movq [edi+ebx+24], mm3 // Write Raw(x) for 4th 8 bytes
3459 paddb mm4, mm3
3460 movq mm5, [edi+ebx+40] // Load Sub(x) for 6th 8 bytes
3461 movq [edi+ebx+32], mm4 // Write Raw(x) for 5th 8 bytes
3462 paddb mm5, mm4
3463 movq mm6, [edi+ebx+48] // Load Sub(x) for 7th 8 bytes
3464 movq [edi+ebx+40], mm5 // Write Raw(x) for 6th 8 bytes
3465 paddb mm6, mm5
3466 movq mm7, [edi+ebx+56] // Load Sub(x) for 8th 8 bytes
3467 movq [edi+ebx+48], mm6 // Write Raw(x) for 7th 8 bytes
3468 add ebx, 64
3469 paddb mm7, mm6
3470 cmp ebx, ecx
3471 movq [edi+ebx-8], mm7 // Write Raw(x) for 8th 8 bytes
3472 jb dsub8lp
3473 cmp ebx, MMXLength
3474 jnb dsub8lt8
3475 dsub8lpA:
3476 movq mm0, [edi+ebx]
3477 add ebx, 8
3478 paddb mm0, mm7
3479 cmp ebx, MMXLength
3480 movq [edi+ebx-8], mm0 // use -8 to offset early add to ebx
3481 movq mm7, mm0 // Move calculated Raw(x) data to mm1 to
3482 // be the new Raw(x-bpp) for the next loop
3483 jb dsub8lpA
3484 dsub8lt8:
3485 } // end _asm block
3486 }
3487 break;
3488
3489 default: // bpp greater than 8 bytes
3490 {
3491 _asm {
3492 mov ebx, diff
3493 mov edi, row
3494 mov esi, edi // lp = row
3495 add edi, bpp // rp = row + bpp
3496 dsubAlp:
3497 movq mm0, [edi+ebx]
3498 movq mm1, [esi+ebx]
3499 add ebx, 8
3500 paddb mm0, mm1
3501 cmp ebx, MMXLength
3502 movq [edi+ebx-8], mm0 // mov does not affect flags; -8 to offset
3503 // add ebx
3504 jb dsubAlp
3505 } // end _asm block
3506 }
3507 break;
3508
3509 } // end switch ( bpp )
3510
3511 _asm {
3512 mov ebx, MMXLength
3513 mov edi, row
3514 cmp ebx, FullLength
3515 jnb dsubend
3516 mov esi, edi // lp = row
3517 xor eax, eax
3518 add edi, bpp // rp = row + bpp
3519 dsublp2:
3520 mov al, [esi+ebx]
3521 add [edi+ebx], al
3522 inc ebx
3523 cmp ebx, FullLength
3524 jb dsublp2
3525 dsubend:
3526 emms // End MMX instructions; prep for possible FP instrs.
3527 } // end _asm block
3528 }
3529
3530 // Optimized code for PNG Up filter decoder
//
// Adds each byte of prev_row to the byte at the same offset in row and
// stores the per-byte (wrapping) sum back into row, for the first
// row_info->rowbytes bytes.  row_info is only read for rowbytes.
//
// The work is done in four phases, all sharing ebx as the byte index:
//   1. byte loop until the row pointer is 8-byte aligned,
//   2. unrolled MMX loop, 64 bytes per pass,
//   3. MMX loop, 8 bytes per pass, for the rest of the 8-byte multiple,
//   4. byte loop for the final (fewer than 8) bytes.
//
// Neither phase 1 nor phase 2 tests the row length before its first pass:
// phase 1 runs up to the alignment boundary and phase 2 always processes
// one full 64-byte group before comparing ebx with its limit.
// NOTE(review): this appears to require rowbytes >= 7 + 64 in the worst
// case -- confirm that every caller guarantees it (the caller in this file
// gates on png_ptr->mmx_rowbytes_threshold).
3531 void /* PRIVATE */
3532 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3533 png_bytep prev_row)
3534 {
3535 png_uint_32 len;
3536 len = row_info->rowbytes; // # of bytes to filter
3537 _asm {
3538 mov edi, row // edi = row (destination, updated in place)
3539 // get # of bytes to alignment
3540 mov ecx, edi
3541 xor ebx, ebx // ebx = byte index into both rows, starts at 0
3542 add ecx, 0x7
3543 xor eax, eax // clear eax; al is the byte accumulator below
3544 and ecx, 0xfffffff8 // ecx = row address rounded up to a multiple of 8
3545 mov esi, prev_row // esi = prev_row (source, read only)
3546 sub ecx, edi // ecx = bytes before the aligned address (0..7)
3547 jz dupgo // row already aligned: skip the byte loop
3548 // fix alignment
3549 duplp1: // phase 1: one byte per pass until ebx reaches ecx
3550 mov al, [edi+ebx]
3551 add al, [esi+ebx]
3552 inc ebx
3553 cmp ebx, ecx
3554 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3555 jb duplp1
3556 dupgo:
3557 mov ecx, len
3558 mov edx, ecx
3559 sub edx, ebx // subtract alignment fix
3560 and edx, 0x0000003f // calc bytes over mult of 64
3561 sub ecx, edx // drop over bytes from length
3562 // Unrolled loop - use all MMX registers and interleave to reduce
3563 // number of branch instructions (loops) and reduce partial stalls
// Phase 2: ecx is the index at which the 64-byte groups end.  The bound
// is tested only at the bottom, so the body runs at least once.
3564 duploop:
3565 movq mm1, [esi+ebx]
3566 movq mm0, [edi+ebx]
3567 movq mm3, [esi+ebx+8]
3568 paddb mm0, mm1
3569 movq mm2, [edi+ebx+8]
3570 movq [edi+ebx], mm0
3571 paddb mm2, mm3
3572 movq mm5, [esi+ebx+16]
3573 movq [edi+ebx+8], mm2
3574 movq mm4, [edi+ebx+16]
3575 movq mm7, [esi+ebx+24]
3576 paddb mm4, mm5
3577 movq mm6, [edi+ebx+24]
3578 movq [edi+ebx+16], mm4
3579 paddb mm6, mm7
3580 movq mm1, [esi+ebx+32]
3581 movq [edi+ebx+24], mm6
3582 movq mm0, [edi+ebx+32]
3583 movq mm3, [esi+ebx+40]
3584 paddb mm0, mm1
3585 movq mm2, [edi+ebx+40]
3586 movq [edi+ebx+32], mm0
3587 paddb mm2, mm3
3588 movq mm5, [esi+ebx+48]
3589 movq [edi+ebx+40], mm2
3590 movq mm4, [edi+ebx+48]
3591 movq mm7, [esi+ebx+56]
3592 paddb mm4, mm5
3593 movq mm6, [edi+ebx+56]
3594 movq [edi+ebx+48], mm4
3595 add ebx, 64
3596 paddb mm6, mm7
3597 cmp ebx, ecx
3598 movq [edi+ebx-8], mm6 // (+56)movq does not affect flags;
3599 // -8 to offset add ebx
3600 jb duploop
3601
3602 cmp edx, 0 // Test for bytes over mult of 64
3603 jz dupend
3604
3605
3606 // 2 lines added by lcreeve@netins.net
3607 // (mail 11 Jul 98 in png-implement list)
3608 cmp edx, 8 //test for less than 8 bytes
3609 jb duplt8
3610
3611
3612 add ecx, edx // ecx back to len (64-byte limit + leftover count)
3613 and edx, 0x00000007 // calc bytes over mult of 8
3614 sub ecx, edx // drop over bytes from length
3615 jz duplt8
3616 // Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously
3617 duplpA: // phase 3: ecx is the index at which the 8-byte groups end
3618 movq mm1, [esi+ebx]
3619 movq mm0, [edi+ebx]
3620 add ebx, 8
3621 paddb mm0, mm1
3622 cmp ebx, ecx
3623 movq [edi+ebx-8], mm0 // movq does not affect flags; -8 to offset add ebx
3624 jb duplpA
3625 cmp edx, 0 // Test for bytes over mult of 8
3626 jz dupend
3627 duplt8: // phase 4: on every path here ecx + edx == len
3628 xor eax, eax
3629 add ecx, edx // move over byte count into counter
3630 // Loop using x86 registers to update remaining bytes
3631 duplp2:
3632 mov al, [edi + ebx]
3633 add al, [esi + ebx]
3634 inc ebx
3635 cmp ebx, ecx
3636 mov [edi + ebx-1], al // mov does not affect flags; -1 to offset inc ebx
3637 jb duplp2
3638 dupend:
3639 // Conversion of filtered row completed
3640 emms // End MMX instructions; prep for possible FP instrs.
3641 } // end _asm block
3642 }
3643
3644
3645 // Optimized png_read_filter_row routines
3646 void /* PRIVATE */
3647 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3648 row, png_bytep prev_row, int filter)
3649 {
3650 #ifdef PNG_DEBUG
3651 char filnm[10];
3652 #endif
3653
3654 if (mmx_supported == 2) {
3655 /* this should have happened in png_init_mmx_flags() already */
3656 png_warning(png_ptr, "asm_flags may not have been initialized");
3657 png_mmx_support();
3658 }
3659
3660 #ifdef PNG_DEBUG
3661 png_debug(1, "in png_read_filter_row\n");
3662 switch (filter)
3663 {
3664 case 0: sprintf(filnm, "none");
3665 break;
3666 case 1: sprintf(filnm, "sub-%s",
3667 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3668 break;
3669 case 2: sprintf(filnm, "up-%s",
3670 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3671 break;
3672 case 3: sprintf(filnm, "avg-%s",
3673 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3674 break;
3675 case 4: sprintf(filnm, "Paeth-%s",
3676 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3677 break;
3678 default: sprintf(filnm, "unknw");
3679 break;
3680 }
3681 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3682 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3683 (int)((row_info->pixel_depth + 7) >> 3));
3684 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3685 #endif /* PNG_DEBUG */
3686
3687 switch (filter)
3688 {
3689 case PNG_FILTER_VALUE_NONE:
3690 break;
3691
3692 case PNG_FILTER_VALUE_SUB:
3693 {
3694 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3695 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3696 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3697 {
3698 png_read_filter_row_mmx_sub(row_info, row);
3699 }
3700 else
3701 {
3702 png_uint_32 i;
3703 png_uint_32 istop = row_info->rowbytes;
3704 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3705 png_bytep rp = row + bpp;
3706 png_bytep lp = row;
3707
3708 for (i = bpp; i < istop; i++)
3709 {
3710 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3711 rp++;
3712 }
3713 }
3714 break;
3715 }
3716
3717 case PNG_FILTER_VALUE_UP:
3718 {
3719 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3720 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3721 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3722 {
3723 png_read_filter_row_mmx_up(row_info, row, prev_row);
3724 }
3725 else
3726 {
3727 png_uint_32 i;
3728 png_uint_32 istop = row_info->rowbytes;
3729 png_bytep rp = row;
3730 png_bytep pp = prev_row;
3731
3732 for (i = 0; i < istop; ++i)
3733 {
3734 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3735 rp++;
3736 }
3737 }
3738 break;
3739 }
3740
3741 case PNG_FILTER_VALUE_AVG:
3742 {
3743 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3744 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3745 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3746 {
3747 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3748 }
3749 else
3750 {
3751 png_uint_32 i;
3752 png_bytep rp = row;
3753 png_bytep pp = prev_row;
3754 png_bytep lp = row;
3755 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3756 png_uint_32 istop = row_info->rowbytes - bpp;
3757
3758 for (i = 0; i < bpp; i++)
3759 {
3760 *rp = (png_byte)(((int)(*rp) +
3761 ((int)(*pp++) >> 1)) & 0xff);
3762 rp++;
3763 }
3764
3765 for (i = 0; i < istop; i++)
3766 {
3767 *rp = (png_byte)(((int)(*rp) +
3768 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3769 rp++;
3770 }
3771 }
3772 break;
3773 }
3774
3775 case PNG_FILTER_VALUE_PAETH:
3776 {
3777 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3778 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3779 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3780 {
3781 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3782 }
3783 else
3784 {
3785 png_uint_32 i;
3786 png_bytep rp = row;
3787 png_bytep pp = prev_row;
3788 png_bytep lp = row;
3789 png_bytep cp = prev_row;
3790 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3791 png_uint_32 istop=row_info->rowbytes - bpp;
3792
3793 for (i = 0; i < bpp; i++)
3794 {
3795 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3796 rp++;
3797 }
3798
3799 for (i = 0; i < istop; i++) // use leftover rp,pp
3800 {
3801 int a, b, c, pa, pb, pc, p;
3802
3803 a = *lp++;
3804 b = *pp++;
3805 c = *cp++;
3806
3807 p = b - c;
3808 pc = a - c;
3809
3810 #ifdef PNG_USE_ABS
3811 pa = abs(p);
3812 pb = abs(pc);
3813 pc = abs(p + pc);
3814 #else
3815 pa = p < 0 ? -p : p;
3816 pb = pc < 0 ? -pc : pc;
3817 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3818 #endif
3819
3820 /*
3821 if (pa <= pb && pa <= pc)
3822 p = a;
3823 else if (pb <= pc)
3824 p = b;
3825 else
3826 p = c;
3827 */
3828
3829 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3830
3831 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3832 rp++;
3833 }
3834 }
3835 break;
3836 }
3837
3838 default:
3839 png_warning(png_ptr, "Ignoring bad row filter type");
3840 *row=0;
3841 break;
3842 }
3843 }
3844
3845 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */