removing OmnisRC flags
[wxWidgets.git] / src / png / pngvcrd.c
1 /* pngvcrd.c - mixed C/assembler version of utilities to read a PNG file
2 *
3 * For Intel x86 CPU and Microsoft Visual C++ compiler
4 *
5 * libpng version 1.2.7 - September 12, 2004
6 * For conditions of distribution and use, see copyright notice in png.h
7 * Copyright (c) 1998-2004 Glenn Randers-Pehrson
8 * Copyright (c) 1998, Intel Corporation
9 *
10 * Contributed by Nirav Chhatrapati, Intel Corporation, 1998
11 * Interface to libpng contributed by Gilles Vollant, 1999
12 *
13 *
14 * In png_do_read_interlace() in libpng versions 1.0.3a through 1.0.4d,
15 * a sign error in the post-MMX cleanup code for each pixel_depth resulted
16 * in bad pixels at the beginning of some rows of some images, and also
17 * (due to out-of-range memory reads and writes) caused heap corruption
18 * when compiled with MSVC 6.0. The error was fixed in version 1.0.4e.
19 *
20 * [png_read_filter_row_mmx_avg() bpp == 2 bugfix, GRR 20000916]
21 *
22 * [runtime MMX configuration, GRR 20010102]
23 *
24 */
25
26 #define PNG_INTERNAL
27 #include "png.h"
28
29 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGVCRD)
30
31 static int mmx_supported=2;
32
33
/* png_mmx_support: run-time probe for MMX on x86 (MSVC inline assembler).
 *
 * Steps, as performed by the assembly below:
 *   1. Try to toggle the ID flag (bit 21) in EFLAGS; if the bit cannot be
 *      changed, the CPUID instruction does not exist -> result 0.
 *   2. Execute CPUID with eax = 0; if the value returned in eax is < 1,
 *      leaf 1 (feature flags) is unavailable -> result 0.
 *   3. Execute CPUID with eax = 1 and test edx against 0x00800000
 *      (bit 23, counting from 0 like the "bit(21)" note below).
 *
 * Returns 1 when MMX is reported, 0 otherwise, and stores the same value
 * in the file-static `mmx_supported` (which starts out as 2, meaning
 * "not probed yet").
 */
34 int PNGAPI
35 png_mmx_support(void)
36 {
37 int mmx_supported_local = 0;
38 _asm {
39 push ebx /* CPUID overwrites ebx/ecx/edx; save them first */
40 push ecx
41 push edx
42
43 pushfd /*Save Eflag to stack */
44 pop eax /*Get Eflag from stack into eax */
45 mov ecx, eax /*Make another copy of Eflag in ecx */
46 xor eax, 0x200000 /*Toggle ID bit in Eflag [i.e. bit(21)] */
47 push eax /*Save modified Eflag back to stack */
48
49 popfd /*Load the modified value into the Eflag reg */
50 pushfd /*Save Eflag to stack */
51 pop eax /*Get Eflag from stack (ID bit flipped only if CPUID exists) */
52 push ecx /* save original Eflag to stack */
53 popfd /* restore original Eflag */
54 xor eax, ecx /*Compare the new Eflag with the original Eflag */
55 jz NOT_SUPPORTED /*If the same, CPUID instruction is not supported, */
56 /*skip following instructions and jump to */
57 /*NOT_SUPPORTED label */
58
59 xor eax, eax /*Set eax to zero: CPUID leaf 0 */
60
61 _asm _emit 0x0f /*CPUID instruction, emitted as raw bytes (two bytes opcode) */
62 _asm _emit 0xa2
63
64 cmp eax, 1 /*eax now holds the value returned by CPUID leaf 0 */
65 jl NOT_SUPPORTED /*signed eax < 1: leaf 1 cannot be queried, report no mmx */
66
67 xor eax, eax /*set eax to zero */
68 inc eax /*Now increment eax to 1 (selects CPUID leaf 1). */
69 /*Original note: chosen as faster than "mov eax, 1" */
70
71 _asm _emit 0x0f /*CPUID instruction */
72 _asm _emit 0xa2
73
74 and edx, 0x00800000 /*mask out all bits but the mmx bit, bit(23) */
75 cmp edx, 0 /* 0 = mmx not supported */
76 jz NOT_SUPPORTED /* non-zero = Yes, mmx IS supported */
77
78 mov mmx_supported_local, 1 /*set return value to 1 */
79
80 NOT_SUPPORTED:
81 mov eax, mmx_supported_local /*load result into eax; the C code below returns it explicitly */
82 pop edx /*restore the registers saved on entry */
83 pop ecx
84 pop ebx
85 }
86
87 /*mmx_supported_local=0; // test code for force don't support MMX */
88 /*printf("MMX : %u (1=MMX supported)\n",mmx_supported_local); */
89
90 mmx_supported = mmx_supported_local; /* cache the result for later callers */
91 return mmx_supported_local;
92 }
93
94 /* Combines the row recently read in with the previous row.
95 This routine takes care of alpha and transparency if requested.
96 This routine also handles the two methods of progressive display
97 of interlaced images, depending on the mask value.
98 The mask value describes which pixels are to be combined with
99 the row. The pattern always repeats every 8 pixels, so just 8
100 bits are needed. A one indicates the pixel is to be combined; a
101 zero indicates the pixel is to be skipped. This is in addition
102 to any alpha or transparency value associated with the pixel. If
103 you want all pixels to be combined, pass 0xff (255) in mask. */
104
105 /* Use this routine for x86 platform - uses faster MMX routine if machine
106 supports MMX */
107
108 void /* PRIVATE */
109 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
110 {
111 #ifdef PNG_USE_LOCAL_ARRAYS
112 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
113 #endif
114
115 png_debug(1,"in png_combine_row_asm\n");
116
117 if (mmx_supported == 2) {
118 #if !defined(PNG_1_0_X)
119 /* this should have happened in png_init_mmx_flags() already */
120 png_warning(png_ptr, "asm_flags may not have been initialized");
121 #endif
122 png_mmx_support();
123 }
124
125 if (mask == 0xff)
126 {
127 png_memcpy(row, png_ptr->row_buf + 1,
128 (png_size_t)PNG_ROWBYTES(png_ptr->row_info.pixel_depth,
129 png_ptr->width));
130 }
131 /* GRR: add "else if (mask == 0)" case?
132 * or does png_combine_row() not even get called in that case? */
133 else
134 {
135 switch (png_ptr->row_info.pixel_depth)
136 {
137 case 1:
138 {
139 png_bytep sp;
140 png_bytep dp;
141 int s_inc, s_start, s_end;
142 int m;
143 int shift;
144 png_uint_32 i;
145
146 sp = png_ptr->row_buf + 1;
147 dp = row;
148 m = 0x80;
149 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
150 if (png_ptr->transformations & PNG_PACKSWAP)
151 {
152 s_start = 0;
153 s_end = 7;
154 s_inc = 1;
155 }
156 else
157 #endif
158 {
159 s_start = 7;
160 s_end = 0;
161 s_inc = -1;
162 }
163
164 shift = s_start;
165
166 for (i = 0; i < png_ptr->width; i++)
167 {
168 if (m & mask)
169 {
170 int value;
171
172 value = (*sp >> shift) & 0x1;
173 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
174 *dp |= (png_byte)(value << shift);
175 }
176
177 if (shift == s_end)
178 {
179 shift = s_start;
180 sp++;
181 dp++;
182 }
183 else
184 shift += s_inc;
185
186 if (m == 1)
187 m = 0x80;
188 else
189 m >>= 1;
190 }
191 break;
192 }
193
194 case 2:
195 {
196 png_bytep sp;
197 png_bytep dp;
198 int s_start, s_end, s_inc;
199 int m;
200 int shift;
201 png_uint_32 i;
202 int value;
203
204 sp = png_ptr->row_buf + 1;
205 dp = row;
206 m = 0x80;
207 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
208 if (png_ptr->transformations & PNG_PACKSWAP)
209 {
210 s_start = 0;
211 s_end = 6;
212 s_inc = 2;
213 }
214 else
215 #endif
216 {
217 s_start = 6;
218 s_end = 0;
219 s_inc = -2;
220 }
221
222 shift = s_start;
223
224 for (i = 0; i < png_ptr->width; i++)
225 {
226 if (m & mask)
227 {
228 value = (*sp >> shift) & 0x3;
229 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
230 *dp |= (png_byte)(value << shift);
231 }
232
233 if (shift == s_end)
234 {
235 shift = s_start;
236 sp++;
237 dp++;
238 }
239 else
240 shift += s_inc;
241 if (m == 1)
242 m = 0x80;
243 else
244 m >>= 1;
245 }
246 break;
247 }
248
249 case 4:
250 {
251 png_bytep sp;
252 png_bytep dp;
253 int s_start, s_end, s_inc;
254 int m;
255 int shift;
256 png_uint_32 i;
257 int value;
258
259 sp = png_ptr->row_buf + 1;
260 dp = row;
261 m = 0x80;
262 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
263 if (png_ptr->transformations & PNG_PACKSWAP)
264 {
265 s_start = 0;
266 s_end = 4;
267 s_inc = 4;
268 }
269 else
270 #endif
271 {
272 s_start = 4;
273 s_end = 0;
274 s_inc = -4;
275 }
276 shift = s_start;
277
278 for (i = 0; i < png_ptr->width; i++)
279 {
280 if (m & mask)
281 {
282 value = (*sp >> shift) & 0xf;
283 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
284 *dp |= (png_byte)(value << shift);
285 }
286
287 if (shift == s_end)
288 {
289 shift = s_start;
290 sp++;
291 dp++;
292 }
293 else
294 shift += s_inc;
295 if (m == 1)
296 m = 0x80;
297 else
298 m >>= 1;
299 }
300 break;
301 }
302
303 case 8:
304 {
305 png_bytep srcptr;
306 png_bytep dstptr;
307 png_uint_32 len;
308 int m;
309 int diff, unmask;
310
311 __int64 mask0=0x0102040810204080;
312
313 #if !defined(PNG_1_0_X)
314 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
315 /* && mmx_supported */ )
316 #else
317 if (mmx_supported)
318 #endif
319 {
320 srcptr = png_ptr->row_buf + 1;
321 dstptr = row;
322 m = 0x80;
323 unmask = ~mask;
324 len = png_ptr->width &~7; /*reduce to multiple of 8 */
325 diff = png_ptr->width & 7; /*amount lost */
326
327 _asm
328 {
329 movd mm7, unmask /*load bit pattern */
330 psubb mm6,mm6 /*zero mm6 */
331 punpcklbw mm7,mm7
332 punpcklwd mm7,mm7
333 punpckldq mm7,mm7 /*fill register with 8 masks */
334
335 movq mm0,mask0
336
337 pand mm0,mm7 /*nonzero if keep byte */
338 pcmpeqb mm0,mm6 /*zeros->1s, v versa */
339
340 mov ecx,len /*load length of line (pixels) */
341 mov esi,srcptr /*load source */
342 mov ebx,dstptr /*load dest */
343 cmp ecx,0 /*lcr */
344 je mainloop8end
345
346 mainloop8:
347 movq mm4,[esi]
348 pand mm4,mm0
349 movq mm6,mm0
350 pandn mm6,[ebx]
351 por mm4,mm6
352 movq [ebx],mm4
353
354 add esi,8 /*inc by 8 bytes processed */
355 add ebx,8
356 sub ecx,8 /*dec by 8 pixels processed */
357
358 ja mainloop8
359 mainloop8end:
360
361 mov ecx,diff
362 cmp ecx,0
363 jz end8
364
365 mov edx,mask
366 sal edx,24 /*make low byte the high byte */
367
368 secondloop8:
369 sal edx,1 /*move high bit to CF */
370 jnc skip8 /*if CF = 0 */
371 mov al,[esi]
372 mov [ebx],al
373 skip8:
374 inc esi
375 inc ebx
376
377 dec ecx
378 jnz secondloop8
379 end8:
380 emms
381 }
382 }
383 else /* mmx not supported - use modified C routine */
384 {
385 register unsigned int incr1, initial_val, final_val;
386 png_size_t pixel_bytes;
387 png_uint_32 i;
388 register int disp = png_pass_inc[png_ptr->pass];
389 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
390
391 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
392 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
393 pixel_bytes;
394 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
395 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
396 final_val = png_ptr->width*pixel_bytes;
397 incr1 = (disp)*pixel_bytes;
398 for (i = initial_val; i < final_val; i += incr1)
399 {
400 png_memcpy(dstptr, srcptr, pixel_bytes);
401 srcptr += incr1;
402 dstptr += incr1;
403 }
404 } /* end of else */
405
406 break;
407 } /* end 8 bpp */
408
409 case 16:
410 {
411 png_bytep srcptr;
412 png_bytep dstptr;
413 png_uint_32 len;
414 int unmask, diff;
415 __int64 mask1=0x0101020204040808,
416 mask0=0x1010202040408080;
417
418 #if !defined(PNG_1_0_X)
419 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
420 /* && mmx_supported */ )
421 #else
422 if (mmx_supported)
423 #endif
424 {
425 srcptr = png_ptr->row_buf + 1;
426 dstptr = row;
427
428 unmask = ~mask;
429 len = (png_ptr->width)&~7;
430 diff = (png_ptr->width)&7;
431 _asm
432 {
433 movd mm7, unmask /*load bit pattern */
434 psubb mm6,mm6 /*zero mm6 */
435 punpcklbw mm7,mm7
436 punpcklwd mm7,mm7
437 punpckldq mm7,mm7 /*fill register with 8 masks */
438
439 movq mm0,mask0
440 movq mm1,mask1
441
442 pand mm0,mm7
443 pand mm1,mm7
444
445 pcmpeqb mm0,mm6
446 pcmpeqb mm1,mm6
447
448 mov ecx,len /*load length of line */
449 mov esi,srcptr /*load source */
450 mov ebx,dstptr /*load dest */
451 cmp ecx,0 /*lcr */
452 jz mainloop16end
453
454 mainloop16:
455 movq mm4,[esi]
456 pand mm4,mm0
457 movq mm6,mm0
458 movq mm7,[ebx]
459 pandn mm6,mm7
460 por mm4,mm6
461 movq [ebx],mm4
462
463 movq mm5,[esi+8]
464 pand mm5,mm1
465 movq mm7,mm1
466 movq mm6,[ebx+8]
467 pandn mm7,mm6
468 por mm5,mm7
469 movq [ebx+8],mm5
470
471 add esi,16 /*inc by 16 bytes processed */
472 add ebx,16
473 sub ecx,8 /*dec by 8 pixels processed */
474
475 ja mainloop16
476
477 mainloop16end:
478 mov ecx,diff
479 cmp ecx,0
480 jz end16
481
482 mov edx,mask
483 sal edx,24 /*make low byte the high byte */
484 secondloop16:
485 sal edx,1 /*move high bit to CF */
486 jnc skip16 /*if CF = 0 */
487 mov ax,[esi]
488 mov [ebx],ax
489 skip16:
490 add esi,2
491 add ebx,2
492
493 dec ecx
494 jnz secondloop16
495 end16:
496 emms
497 }
498 }
499 else /* mmx not supported - use modified C routine */
500 {
501 register unsigned int incr1, initial_val, final_val;
502 png_size_t pixel_bytes;
503 png_uint_32 i;
504 register int disp = png_pass_inc[png_ptr->pass];
505 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
506
507 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
508 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
509 pixel_bytes;
510 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
511 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
512 final_val = png_ptr->width*pixel_bytes;
513 incr1 = (disp)*pixel_bytes;
514 for (i = initial_val; i < final_val; i += incr1)
515 {
516 png_memcpy(dstptr, srcptr, pixel_bytes);
517 srcptr += incr1;
518 dstptr += incr1;
519 }
520 } /* end of else */
521
522 break;
523 } /* end 16 bpp */
524
525 case 24:
526 {
527 png_bytep srcptr;
528 png_bytep dstptr;
529 png_uint_32 len;
530 int unmask, diff;
531
532 __int64 mask2=0x0101010202020404, /*24bpp */
533 mask1=0x0408080810101020,
534 mask0=0x2020404040808080;
535
536 srcptr = png_ptr->row_buf + 1;
537 dstptr = row;
538
539 unmask = ~mask;
540 len = (png_ptr->width)&~7;
541 diff = (png_ptr->width)&7;
542
543 #if !defined(PNG_1_0_X)
544 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
545 /* && mmx_supported */ )
546 #else
547 if (mmx_supported)
548 #endif
549 {
550 _asm
551 {
552 movd mm7, unmask /*load bit pattern */
553 psubb mm6,mm6 /*zero mm6 */
554 punpcklbw mm7,mm7
555 punpcklwd mm7,mm7
556 punpckldq mm7,mm7 /*fill register with 8 masks */
557
558 movq mm0,mask0
559 movq mm1,mask1
560 movq mm2,mask2
561
562 pand mm0,mm7
563 pand mm1,mm7
564 pand mm2,mm7
565
566 pcmpeqb mm0,mm6
567 pcmpeqb mm1,mm6
568 pcmpeqb mm2,mm6
569
570 mov ecx,len /*load length of line */
571 mov esi,srcptr /*load source */
572 mov ebx,dstptr /*load dest */
573 cmp ecx,0
574 jz mainloop24end
575
576 mainloop24:
577 movq mm4,[esi]
578 pand mm4,mm0
579 movq mm6,mm0
580 movq mm7,[ebx]
581 pandn mm6,mm7
582 por mm4,mm6
583 movq [ebx],mm4
584
585
586 movq mm5,[esi+8]
587 pand mm5,mm1
588 movq mm7,mm1
589 movq mm6,[ebx+8]
590 pandn mm7,mm6
591 por mm5,mm7
592 movq [ebx+8],mm5
593
594 movq mm6,[esi+16]
595 pand mm6,mm2
596 movq mm4,mm2
597 movq mm7,[ebx+16]
598 pandn mm4,mm7
599 por mm6,mm4
600 movq [ebx+16],mm6
601
602 add esi,24 /*inc by 24 bytes processed */
603 add ebx,24
604 sub ecx,8 /*dec by 8 pixels processed */
605
606 ja mainloop24
607
608 mainloop24end:
609 mov ecx,diff
610 cmp ecx,0
611 jz end24
612
613 mov edx,mask
614 sal edx,24 /*make low byte the high byte */
615 secondloop24:
616 sal edx,1 /*move high bit to CF */
617 jnc skip24 /*if CF = 0 */
618 mov ax,[esi]
619 mov [ebx],ax
620 xor eax,eax
621 mov al,[esi+2]
622 mov [ebx+2],al
623 skip24:
624 add esi,3
625 add ebx,3
626
627 dec ecx
628 jnz secondloop24
629
630 end24:
631 emms
632 }
633 }
634 else /* mmx not supported - use modified C routine */
635 {
636 register unsigned int incr1, initial_val, final_val;
637 png_size_t pixel_bytes;
638 png_uint_32 i;
639 register int disp = png_pass_inc[png_ptr->pass];
640 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
641
642 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
643 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
644 pixel_bytes;
645 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
646 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
647 final_val = png_ptr->width*pixel_bytes;
648 incr1 = (disp)*pixel_bytes;
649 for (i = initial_val; i < final_val; i += incr1)
650 {
651 png_memcpy(dstptr, srcptr, pixel_bytes);
652 srcptr += incr1;
653 dstptr += incr1;
654 }
655 } /* end of else */
656
657 break;
658 } /* end 24 bpp */
659
660 case 32:
661 {
662 png_bytep srcptr;
663 png_bytep dstptr;
664 png_uint_32 len;
665 int unmask, diff;
666
667 __int64 mask3=0x0101010102020202, /*32bpp */
668 mask2=0x0404040408080808,
669 mask1=0x1010101020202020,
670 mask0=0x4040404080808080;
671
672 srcptr = png_ptr->row_buf + 1;
673 dstptr = row;
674
675 unmask = ~mask;
676 len = (png_ptr->width)&~7;
677 diff = (png_ptr->width)&7;
678
679 #if !defined(PNG_1_0_X)
680 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
681 /* && mmx_supported */ )
682 #else
683 if (mmx_supported)
684 #endif
685 {
686 _asm
687 {
688 movd mm7, unmask /*load bit pattern */
689 psubb mm6,mm6 /*zero mm6 */
690 punpcklbw mm7,mm7
691 punpcklwd mm7,mm7
692 punpckldq mm7,mm7 /*fill register with 8 masks */
693
694 movq mm0,mask0
695 movq mm1,mask1
696 movq mm2,mask2
697 movq mm3,mask3
698
699 pand mm0,mm7
700 pand mm1,mm7
701 pand mm2,mm7
702 pand mm3,mm7
703
704 pcmpeqb mm0,mm6
705 pcmpeqb mm1,mm6
706 pcmpeqb mm2,mm6
707 pcmpeqb mm3,mm6
708
709 mov ecx,len /*load length of line */
710 mov esi,srcptr /*load source */
711 mov ebx,dstptr /*load dest */
712
713 cmp ecx,0 /*lcr */
714 jz mainloop32end
715
716 mainloop32:
717 movq mm4,[esi]
718 pand mm4,mm0
719 movq mm6,mm0
720 movq mm7,[ebx]
721 pandn mm6,mm7
722 por mm4,mm6
723 movq [ebx],mm4
724
725 movq mm5,[esi+8]
726 pand mm5,mm1
727 movq mm7,mm1
728 movq mm6,[ebx+8]
729 pandn mm7,mm6
730 por mm5,mm7
731 movq [ebx+8],mm5
732
733 movq mm6,[esi+16]
734 pand mm6,mm2
735 movq mm4,mm2
736 movq mm7,[ebx+16]
737 pandn mm4,mm7
738 por mm6,mm4
739 movq [ebx+16],mm6
740
741 movq mm7,[esi+24]
742 pand mm7,mm3
743 movq mm5,mm3
744 movq mm4,[ebx+24]
745 pandn mm5,mm4
746 por mm7,mm5
747 movq [ebx+24],mm7
748
749 add esi,32 /*inc by 32 bytes processed */
750 add ebx,32
751 sub ecx,8 /*dec by 8 pixels processed */
752
753 ja mainloop32
754
755 mainloop32end:
756 mov ecx,diff
757 cmp ecx,0
758 jz end32
759
760 mov edx,mask
761 sal edx,24 /*make low byte the high byte */
762 secondloop32:
763 sal edx,1 /*move high bit to CF */
764 jnc skip32 /*if CF = 0 */
765 mov eax,[esi]
766 mov [ebx],eax
767 skip32:
768 add esi,4
769 add ebx,4
770
771 dec ecx
772 jnz secondloop32
773
774 end32:
775 emms
776 }
777 }
778 else /* mmx _not supported - Use modified C routine */
779 {
780 register unsigned int incr1, initial_val, final_val;
781 png_size_t pixel_bytes;
782 png_uint_32 i;
783 register int disp = png_pass_inc[png_ptr->pass];
784 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
785
786 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
787 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
788 pixel_bytes;
789 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
790 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
791 final_val = png_ptr->width*pixel_bytes;
792 incr1 = (disp)*pixel_bytes;
793 for (i = initial_val; i < final_val; i += incr1)
794 {
795 png_memcpy(dstptr, srcptr, pixel_bytes);
796 srcptr += incr1;
797 dstptr += incr1;
798 }
799 } /* end of else */
800
801 break;
802 } /* end 32 bpp */
803
804 case 48:
805 {
806 png_bytep srcptr;
807 png_bytep dstptr;
808 png_uint_32 len;
809 int unmask, diff;
810
811 __int64 mask5=0x0101010101010202,
812 mask4=0x0202020204040404,
813 mask3=0x0404080808080808,
814 mask2=0x1010101010102020,
815 mask1=0x2020202040404040,
816 mask0=0x4040808080808080;
817
818 #if !defined(PNG_1_0_X)
819 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
820 /* && mmx_supported */ )
821 #else
822 if (mmx_supported)
823 #endif
824 {
825 srcptr = png_ptr->row_buf + 1;
826 dstptr = row;
827
828 unmask = ~mask;
829 len = (png_ptr->width)&~7;
830 diff = (png_ptr->width)&7;
831 _asm
832 {
833 movd mm7, unmask /*load bit pattern */
834 psubb mm6,mm6 /*zero mm6 */
835 punpcklbw mm7,mm7
836 punpcklwd mm7,mm7
837 punpckldq mm7,mm7 /*fill register with 8 masks */
838
839 movq mm0,mask0
840 movq mm1,mask1
841 movq mm2,mask2
842 movq mm3,mask3
843 movq mm4,mask4
844 movq mm5,mask5
845
846 pand mm0,mm7
847 pand mm1,mm7
848 pand mm2,mm7
849 pand mm3,mm7
850 pand mm4,mm7
851 pand mm5,mm7
852
853 pcmpeqb mm0,mm6
854 pcmpeqb mm1,mm6
855 pcmpeqb mm2,mm6
856 pcmpeqb mm3,mm6
857 pcmpeqb mm4,mm6
858 pcmpeqb mm5,mm6
859
860 mov ecx,len /*load length of line */
861 mov esi,srcptr /*load source */
862 mov ebx,dstptr /*load dest */
863
864 cmp ecx,0
865 jz mainloop48end
866
867 mainloop48:
868 movq mm7,[esi]
869 pand mm7,mm0
870 movq mm6,mm0
871 pandn mm6,[ebx]
872 por mm7,mm6
873 movq [ebx],mm7
874
875 movq mm6,[esi+8]
876 pand mm6,mm1
877 movq mm7,mm1
878 pandn mm7,[ebx+8]
879 por mm6,mm7
880 movq [ebx+8],mm6
881
882 movq mm6,[esi+16]
883 pand mm6,mm2
884 movq mm7,mm2
885 pandn mm7,[ebx+16]
886 por mm6,mm7
887 movq [ebx+16],mm6
888
889 movq mm7,[esi+24]
890 pand mm7,mm3
891 movq mm6,mm3
892 pandn mm6,[ebx+24]
893 por mm7,mm6
894 movq [ebx+24],mm7
895
896 movq mm6,[esi+32]
897 pand mm6,mm4
898 movq mm7,mm4
899 pandn mm7,[ebx+32]
900 por mm6,mm7
901 movq [ebx+32],mm6
902
903 movq mm7,[esi+40]
904 pand mm7,mm5
905 movq mm6,mm5
906 pandn mm6,[ebx+40]
907 por mm7,mm6
908 movq [ebx+40],mm7
909
910 add esi,48 /*inc by 32 bytes processed */
911 add ebx,48
912 sub ecx,8 /*dec by 8 pixels processed */
913
914 ja mainloop48
915 mainloop48end:
916
917 mov ecx,diff
918 cmp ecx,0
919 jz end48
920
921 mov edx,mask
922 sal edx,24 /*make low byte the high byte */
923
924 secondloop48:
925 sal edx,1 /*move high bit to CF */
926 jnc skip48 /*if CF = 0 */
927 mov eax,[esi]
928 mov [ebx],eax
929 skip48:
930 add esi,4
931 add ebx,4
932
933 dec ecx
934 jnz secondloop48
935
936 end48:
937 emms
938 }
939 }
940 else /* mmx _not supported - Use modified C routine */
941 {
942 register unsigned int incr1, initial_val, final_val;
943 png_size_t pixel_bytes;
944 png_uint_32 i;
945 register int disp = png_pass_inc[png_ptr->pass];
946 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
947
948 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
949 srcptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
950 pixel_bytes;
951 dstptr = row + offset_table[png_ptr->pass]*pixel_bytes;
952 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
953 final_val = png_ptr->width*pixel_bytes;
954 incr1 = (disp)*pixel_bytes;
955 for (i = initial_val; i < final_val; i += incr1)
956 {
957 png_memcpy(dstptr, srcptr, pixel_bytes);
958 srcptr += incr1;
959 dstptr += incr1;
960 }
961 } /* end of else */
962
963 break;
964 } /* end 48 bpp */
965
966 default:
967 {
968 png_bytep sptr;
969 png_bytep dp;
970 png_size_t pixel_bytes;
971 int offset_table[7] = {0, 4, 0, 2, 0, 1, 0};
972 unsigned int i;
973 register int disp = png_pass_inc[png_ptr->pass]; /* get the offset */
974 register unsigned int incr1, initial_val, final_val;
975
976 pixel_bytes = (png_ptr->row_info.pixel_depth >> 3);
977 sptr = png_ptr->row_buf + 1 + offset_table[png_ptr->pass]*
978 pixel_bytes;
979 dp = row + offset_table[png_ptr->pass]*pixel_bytes;
980 initial_val = offset_table[png_ptr->pass]*pixel_bytes;
981 final_val = png_ptr->width*pixel_bytes;
982 incr1 = (disp)*pixel_bytes;
983 for (i = initial_val; i < final_val; i += incr1)
984 {
985 png_memcpy(dp, sptr, pixel_bytes);
986 sptr += incr1;
987 dp += incr1;
988 }
989 break;
990 }
991 } /* end switch (png_ptr->row_info.pixel_depth) */
992 } /* end if (non-trivial mask) */
993
994 } /* end png_combine_row() */
995
996
997 #if defined(PNG_READ_INTERLACING_SUPPORTED)
998
999 void /* PRIVATE */
1000 png_do_read_interlace(png_structp png_ptr)
1001 {
1002 png_row_infop row_info = &(png_ptr->row_info);
1003 png_bytep row = png_ptr->row_buf + 1;
1004 int pass = png_ptr->pass;
1005 png_uint_32 transformations = png_ptr->transformations;
1006 #ifdef PNG_USE_LOCAL_ARRAYS
1007 const int png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
1008 #endif
1009
1010 png_debug(1,"in png_do_read_interlace\n");
1011
1012 if (mmx_supported == 2) {
1013 #if !defined(PNG_1_0_X)
1014 /* this should have happened in png_init_mmx_flags() already */
1015 png_warning(png_ptr, "asm_flags may not have been initialized");
1016 #endif
1017 png_mmx_support();
1018 }
1019
1020 if (row != NULL && row_info != NULL)
1021 {
1022 png_uint_32 final_width;
1023
1024 final_width = row_info->width * png_pass_inc[pass];
1025
1026 switch (row_info->pixel_depth)
1027 {
1028 case 1:
1029 {
1030 png_bytep sp, dp;
1031 int sshift, dshift;
1032 int s_start, s_end, s_inc;
1033 png_byte v;
1034 png_uint_32 i;
1035 int j;
1036
1037 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1038 dp = row + (png_size_t)((final_width - 1) >> 3);
1039 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1040 if (transformations & PNG_PACKSWAP)
1041 {
1042 sshift = (int)((row_info->width + 7) & 7);
1043 dshift = (int)((final_width + 7) & 7);
1044 s_start = 7;
1045 s_end = 0;
1046 s_inc = -1;
1047 }
1048 else
1049 #endif
1050 {
1051 sshift = 7 - (int)((row_info->width + 7) & 7);
1052 dshift = 7 - (int)((final_width + 7) & 7);
1053 s_start = 0;
1054 s_end = 7;
1055 s_inc = 1;
1056 }
1057
1058 for (i = row_info->width; i; i--)
1059 {
1060 v = (png_byte)((*sp >> sshift) & 0x1);
1061 for (j = 0; j < png_pass_inc[pass]; j++)
1062 {
1063 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1064 *dp |= (png_byte)(v << dshift);
1065 if (dshift == s_end)
1066 {
1067 dshift = s_start;
1068 dp--;
1069 }
1070 else
1071 dshift += s_inc;
1072 }
1073 if (sshift == s_end)
1074 {
1075 sshift = s_start;
1076 sp--;
1077 }
1078 else
1079 sshift += s_inc;
1080 }
1081 break;
1082 }
1083
1084 case 2:
1085 {
1086 png_bytep sp, dp;
1087 int sshift, dshift;
1088 int s_start, s_end, s_inc;
1089 png_uint_32 i;
1090
1091 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1092 dp = row + (png_size_t)((final_width - 1) >> 2);
1093 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1094 if (transformations & PNG_PACKSWAP)
1095 {
1096 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1097 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1098 s_start = 6;
1099 s_end = 0;
1100 s_inc = -2;
1101 }
1102 else
1103 #endif
1104 {
1105 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1106 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1107 s_start = 0;
1108 s_end = 6;
1109 s_inc = 2;
1110 }
1111
1112 for (i = row_info->width; i; i--)
1113 {
1114 png_byte v;
1115 int j;
1116
1117 v = (png_byte)((*sp >> sshift) & 0x3);
1118 for (j = 0; j < png_pass_inc[pass]; j++)
1119 {
1120 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1121 *dp |= (png_byte)(v << dshift);
1122 if (dshift == s_end)
1123 {
1124 dshift = s_start;
1125 dp--;
1126 }
1127 else
1128 dshift += s_inc;
1129 }
1130 if (sshift == s_end)
1131 {
1132 sshift = s_start;
1133 sp--;
1134 }
1135 else
1136 sshift += s_inc;
1137 }
1138 break;
1139 }
1140
1141 case 4:
1142 {
1143 png_bytep sp, dp;
1144 int sshift, dshift;
1145 int s_start, s_end, s_inc;
1146 png_uint_32 i;
1147
1148 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1149 dp = row + (png_size_t)((final_width - 1) >> 1);
1150 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1151 if (transformations & PNG_PACKSWAP)
1152 {
1153 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1154 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1155 s_start = 4;
1156 s_end = 0;
1157 s_inc = -4;
1158 }
1159 else
1160 #endif
1161 {
1162 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1163 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1164 s_start = 0;
1165 s_end = 4;
1166 s_inc = 4;
1167 }
1168
1169 for (i = row_info->width; i; i--)
1170 {
1171 png_byte v;
1172 int j;
1173
1174 v = (png_byte)((*sp >> sshift) & 0xf);
1175 for (j = 0; j < png_pass_inc[pass]; j++)
1176 {
1177 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1178 *dp |= (png_byte)(v << dshift);
1179 if (dshift == s_end)
1180 {
1181 dshift = s_start;
1182 dp--;
1183 }
1184 else
1185 dshift += s_inc;
1186 }
1187 if (sshift == s_end)
1188 {
1189 sshift = s_start;
1190 sp--;
1191 }
1192 else
1193 sshift += s_inc;
1194 }
1195 break;
1196 }
1197
1198 default: /* This is the place where the routine is modified */
1199 {
1200 __int64 const4 = 0x0000000000FFFFFF;
1201 /* __int64 const5 = 0x000000FFFFFF0000; // unused... */
1202 __int64 const6 = 0x00000000000000FF;
1203 png_bytep sptr, dp;
1204 png_uint_32 i;
1205 png_size_t pixel_bytes;
1206 int width = row_info->width;
1207
1208 pixel_bytes = (row_info->pixel_depth >> 3);
1209
1210 sptr = row + (width - 1) * pixel_bytes;
1211 dp = row + (final_width - 1) * pixel_bytes;
1212 /* New code by Nirav Chhatrapati - Intel Corporation */
1213 /* sign fix by GRR */
1214 /* NOTE: there is NO MMX code for 48-bit and 64-bit images */
1215
1216 // use MMX routine if machine supports it
1217 #if !defined(PNG_1_0_X)
1218 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1219 /* && mmx_supported */ )
1220 #else
1221 if (mmx_supported)
1222 #endif
1223 {
1224 if (pixel_bytes == 3)
1225 {
1226 if (((pass == 0) || (pass == 1)) && width)
1227 {
1228 _asm
1229 {
1230 mov esi, sptr
1231 mov edi, dp
1232 mov ecx, width
1233 sub edi, 21 /* (png_pass_inc[pass] - 1)*pixel_bytes */
1234 loop_pass0:
1235 movd mm0, [esi] ; X X X X X v2 v1 v0
1236 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1237 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1238 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1239 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1240 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1241 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1242 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1243 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1244 movq mm3, mm0 ; v2 v1 v0 v2 v1 v0 v2 v1
1245 psllq mm0, 16 ; v0 v2 v1 v0 v2 v1 0 0
1246 movq mm4, mm3 ; v2 v1 v0 v2 v1 v0 v2 v1
1247 punpckhdq mm3, mm0 ; v0 v2 v1 v0 v2 v1 v0 v2
1248 movq [edi+16] , mm4
1249 psrlq mm0, 32 ; 0 0 0 0 v0 v2 v1 v0
1250 movq [edi+8] , mm3
1251 punpckldq mm0, mm4 ; v1 v0 v2 v1 v0 v2 v1 v0
1252 sub esi, 3
1253 movq [edi], mm0
1254 sub edi, 24
1255 /*sub esi, 3 */
1256 dec ecx
1257 jnz loop_pass0
1258 EMMS
1259 }
1260 }
1261 else if (((pass == 2) || (pass == 3)) && width)
1262 {
1263 _asm
1264 {
1265 mov esi, sptr
1266 mov edi, dp
1267 mov ecx, width
1268 sub edi, 9 /* (png_pass_inc[pass] - 1)*pixel_bytes */
1269 loop_pass2:
1270 movd mm0, [esi] ; X X X X X v2 v1 v0
1271 pand mm0, const4 ; 0 0 0 0 0 v2 v1 v0
1272 movq mm1, mm0 ; 0 0 0 0 0 v2 v1 v0
1273 psllq mm0, 16 ; 0 0 0 v2 v1 v0 0 0
1274 movq mm2, mm0 ; 0 0 0 v2 v1 v0 0 0
1275 psllq mm0, 24 ; v2 v1 v0 0 0 0 0 0
1276 psrlq mm1, 8 ; 0 0 0 0 0 0 v2 v1
1277 por mm0, mm2 ; v2 v1 v0 v2 v1 v0 0 0
1278 por mm0, mm1 ; v2 v1 v0 v2 v1 v0 v2 v1
1279 movq [edi+4], mm0 ; move to memory
1280 psrlq mm0, 16 ; 0 0 v2 v1 v0 v2 v1 v0
1281 movd [edi], mm0 ; move to memory
1282 sub esi, 3
1283 sub edi, 12
1284 dec ecx
1285 jnz loop_pass2
1286 EMMS
1287 }
1288 }
1289 else if (width) /* && ((pass == 4) || (pass == 5)) */
1290 {
1291 int width_mmx = ((width >> 1) << 1) - 8;
1292 if (width_mmx < 0)
1293 width_mmx = 0;
1294 width -= width_mmx; /* 8 or 9 pix, 24 or 27 bytes */
1295 if (width_mmx)
1296 {
1297 _asm
1298 {
1299 mov esi, sptr
1300 mov edi, dp
1301 mov ecx, width_mmx
1302 sub esi, 3
1303 sub edi, 9
1304 loop_pass4:
1305 movq mm0, [esi] ; X X v2 v1 v0 v5 v4 v3
1306 movq mm7, mm0 ; X X v2 v1 v0 v5 v4 v3
1307 movq mm6, mm0 ; X X v2 v1 v0 v5 v4 v3
1308 psllq mm0, 24 ; v1 v0 v5 v4 v3 0 0 0
1309 pand mm7, const4 ; 0 0 0 0 0 v5 v4 v3
1310 psrlq mm6, 24 ; 0 0 0 X X v2 v1 v0
1311 por mm0, mm7 ; v1 v0 v5 v4 v3 v5 v4 v3
1312 movq mm5, mm6 ; 0 0 0 X X v2 v1 v0
1313 psllq mm6, 8 ; 0 0 X X v2 v1 v0 0
1314 movq [edi], mm0 ; move quad to memory
1315 psrlq mm5, 16 ; 0 0 0 0 0 X X v2
1316 pand mm5, const6 ; 0 0 0 0 0 0 0 v2
1317 por mm6, mm5 ; 0 0 X X v2 v1 v0 v2
1318 movd [edi+8], mm6 ; move double to memory
1319 sub esi, 6
1320 sub edi, 12
1321 sub ecx, 2
1322 jnz loop_pass4
1323 EMMS
1324 }
1325 }
1326
1327 sptr -= width_mmx*3;
1328 dp -= width_mmx*6;
1329 for (i = width; i; i--)
1330 {
1331 png_byte v[8];
1332 int j;
1333
1334 png_memcpy(v, sptr, 3);
1335 for (j = 0; j < png_pass_inc[pass]; j++)
1336 {
1337 png_memcpy(dp, v, 3);
1338 dp -= 3;
1339 }
1340 sptr -= 3;
1341 }
1342 }
1343 } /* end of pixel_bytes == 3 */
1344
1345 else if (pixel_bytes == 1)
1346 {
1347 if (((pass == 0) || (pass == 1)) && width)
1348 {
1349 int width_mmx = ((width >> 2) << 2);
1350 width -= width_mmx;
1351 if (width_mmx)
1352 {
1353 _asm
1354 {
1355 mov esi, sptr
1356 mov edi, dp
1357 mov ecx, width_mmx
1358 sub edi, 31
1359 sub esi, 3
1360 loop1_pass0:
1361 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1362 movq mm1, mm0 ; X X X X v0 v1 v2 v3
1363 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1364 movq mm2, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1365 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1366 movq mm3, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1367 punpckldq mm0, mm0 ; v3 v3 v3 v3 v3 v3 v3 v3
1368 punpckhdq mm3, mm3 ; v2 v2 v2 v2 v2 v2 v2 v2
1369 movq [edi], mm0 ; move to memory v3
1370 punpckhwd mm2, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1371 movq [edi+8], mm3 ; move to memory v2
1372 movq mm4, mm2 ; v0 v0 v0 v0 v1 v1 v1 v1
1373 punpckldq mm2, mm2 ; v1 v1 v1 v1 v1 v1 v1 v1
1374 punpckhdq mm4, mm4 ; v0 v0 v0 v0 v0 v0 v0 v0
1375 movq [edi+16], mm2 ; move to memory v1
1376 movq [edi+24], mm4 ; move to memory v0
1377 sub esi, 4
1378 sub edi, 32
1379 sub ecx, 4
1380 jnz loop1_pass0
1381 EMMS
1382 }
1383 }
1384
1385 sptr -= width_mmx;
1386 dp -= width_mmx*8;
1387 for (i = width; i; i--)
1388 {
1389 int j;
1390
1391 /* I simplified this part in version 1.0.4e
1392 * here and in several other instances where
1393 * pixel_bytes == 1 -- GR-P
1394 *
1395 * Original code:
1396 *
1397 * png_byte v[8];
1398 * png_memcpy(v, sptr, pixel_bytes);
1399 * for (j = 0; j < png_pass_inc[pass]; j++)
1400 * {
1401 * png_memcpy(dp, v, pixel_bytes);
1402 * dp -= pixel_bytes;
1403 * }
1404 * sptr -= pixel_bytes;
1405 *
1406 * Replacement code is in the next three lines:
1407 */
1408
1409 for (j = 0; j < png_pass_inc[pass]; j++)
1410 *dp-- = *sptr;
1411 sptr--;
1412 }
1413 }
1414 else if (((pass == 2) || (pass == 3)) && width)
1415 {
1416 int width_mmx = ((width >> 2) << 2);
1417 width -= width_mmx;
1418 if (width_mmx)
1419 {
1420 _asm
1421 {
1422 mov esi, sptr
1423 mov edi, dp
1424 mov ecx, width_mmx
1425 sub edi, 15
1426 sub esi, 3
1427 loop1_pass2:
1428 movd mm0, [esi] ; X X X X v0 v1 v2 v3
1429 punpcklbw mm0, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1430 movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3
1431 punpcklwd mm0, mm0 ; v2 v2 v2 v2 v3 v3 v3 v3
1432 punpckhwd mm1, mm1 ; v0 v0 v0 v0 v1 v1 v1 v1
1433 movq [edi], mm0 ; move to memory v2 and v3
1434 sub esi, 4
1435 movq [edi+8], mm1 ; move to memory v1 and v0
1436 sub edi, 16
1437 sub ecx, 4
1438 jnz loop1_pass2
1439 EMMS
1440 }
1441 }
1442
1443 sptr -= width_mmx;
1444 dp -= width_mmx*4;
1445 for (i = width; i; i--)
1446 {
1447 int j;
1448
1449 for (j = 0; j < png_pass_inc[pass]; j++)
1450 {
1451 *dp-- = *sptr;
1452 }
1453 sptr --;
1454 }
1455 }
1456 else if (width) /* && ((pass == 4) || (pass == 5))) */
1457 {
1458 int width_mmx = ((width >> 3) << 3);
1459 width -= width_mmx;
1460 if (width_mmx)
1461 {
1462 _asm
1463 {
1464 mov esi, sptr
1465 mov edi, dp
1466 mov ecx, width_mmx
1467 sub edi, 15
1468 sub esi, 7
1469 loop1_pass4:
1470 movq mm0, [esi] ; v0 v1 v2 v3 v4 v5 v6 v7
1471 movq mm1, mm0 ; v0 v1 v2 v3 v4 v5 v6 v7
1472 punpcklbw mm0, mm0 ; v4 v4 v5 v5 v6 v6 v7 v7
1473 /*movq mm1, mm0 ; v0 v0 v1 v1 v2 v2 v3 v3 */
1474 punpckhbw mm1, mm1 ;v0 v0 v1 v1 v2 v2 v3 v3
1475 movq [edi+8], mm1 ; move to memory v0 v1 v2 and v3
1476 sub esi, 8
1477 movq [edi], mm0 ; move to memory v4 v5 v6 and v7
1478 /*sub esi, 4 */
1479 sub edi, 16
1480 sub ecx, 8
1481 jnz loop1_pass4
1482 EMMS
1483 }
1484 }
1485
1486 sptr -= width_mmx;
1487 dp -= width_mmx*2;
1488 for (i = width; i; i--)
1489 {
1490 int j;
1491
1492 for (j = 0; j < png_pass_inc[pass]; j++)
1493 {
1494 *dp-- = *sptr;
1495 }
1496 sptr --;
1497 }
1498 }
1499 } /* end of pixel_bytes == 1 */
1500
1501 else if (pixel_bytes == 2)
1502 {
1503 if (((pass == 0) || (pass == 1)) && width)
1504 {
1505 int width_mmx = ((width >> 1) << 1);
1506 width -= width_mmx;
1507 if (width_mmx)
1508 {
1509 _asm
1510 {
1511 mov esi, sptr
1512 mov edi, dp
1513 mov ecx, width_mmx
1514 sub esi, 2
1515 sub edi, 30
1516 loop2_pass0:
1517 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1518 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1519 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1520 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1521 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1522 movq [edi], mm0
1523 movq [edi + 8], mm0
1524 movq [edi + 16], mm1
1525 movq [edi + 24], mm1
1526 sub esi, 4
1527 sub edi, 32
1528 sub ecx, 2
1529 jnz loop2_pass0
1530 EMMS
1531 }
1532 }
1533
1534 sptr -= (width_mmx*2 - 2); /* sign fixed */
1535 dp -= (width_mmx*16 - 2); /* sign fixed */
1536 for (i = width; i; i--)
1537 {
1538 png_byte v[8];
1539 int j;
1540 sptr -= 2;
1541 png_memcpy(v, sptr, 2);
1542 for (j = 0; j < png_pass_inc[pass]; j++)
1543 {
1544 dp -= 2;
1545 png_memcpy(dp, v, 2);
1546 }
1547 }
1548 }
1549 else if (((pass == 2) || (pass == 3)) && width)
1550 {
1551 int width_mmx = ((width >> 1) << 1) ;
1552 width -= width_mmx;
1553 if (width_mmx)
1554 {
1555 _asm
1556 {
1557 mov esi, sptr
1558 mov edi, dp
1559 mov ecx, width_mmx
1560 sub esi, 2
1561 sub edi, 14
1562 loop2_pass2:
1563 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1564 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1565 movq mm1, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1566 punpckldq mm0, mm0 ; v3 v2 v3 v2 v3 v2 v3 v2
1567 punpckhdq mm1, mm1 ; v1 v0 v1 v0 v1 v0 v1 v0
1568 movq [edi], mm0
1569 sub esi, 4
1570 movq [edi + 8], mm1
1571 /*sub esi, 4 */
1572 sub edi, 16
1573 sub ecx, 2
1574 jnz loop2_pass2
1575 EMMS
1576 }
1577 }
1578
1579 sptr -= (width_mmx*2 - 2); /* sign fixed */
1580 dp -= (width_mmx*8 - 2); /* sign fixed */
1581 for (i = width; i; i--)
1582 {
1583 png_byte v[8];
1584 int j;
1585 sptr -= 2;
1586 png_memcpy(v, sptr, 2);
1587 for (j = 0; j < png_pass_inc[pass]; j++)
1588 {
1589 dp -= 2;
1590 png_memcpy(dp, v, 2);
1591 }
1592 }
1593 }
1594 else if (width) /* pass == 4 or 5 */
1595 {
1596 int width_mmx = ((width >> 1) << 1) ;
1597 width -= width_mmx;
1598 if (width_mmx)
1599 {
1600 _asm
1601 {
1602 mov esi, sptr
1603 mov edi, dp
1604 mov ecx, width_mmx
1605 sub esi, 2
1606 sub edi, 6
1607 loop2_pass4:
1608 movd mm0, [esi] ; X X X X v1 v0 v3 v2
1609 punpcklwd mm0, mm0 ; v1 v0 v1 v0 v3 v2 v3 v2
1610 sub esi, 4
1611 movq [edi], mm0
1612 sub edi, 8
1613 sub ecx, 2
1614 jnz loop2_pass4
1615 EMMS
1616 }
1617 }
1618
1619 sptr -= (width_mmx*2 - 2); /* sign fixed */
1620 dp -= (width_mmx*4 - 2); /* sign fixed */
1621 for (i = width; i; i--)
1622 {
1623 png_byte v[8];
1624 int j;
1625 sptr -= 2;
1626 png_memcpy(v, sptr, 2);
1627 for (j = 0; j < png_pass_inc[pass]; j++)
1628 {
1629 dp -= 2;
1630 png_memcpy(dp, v, 2);
1631 }
1632 }
1633 }
1634 } /* end of pixel_bytes == 2 */
1635
1636 else if (pixel_bytes == 4)
1637 {
1638 if (((pass == 0) || (pass == 1)) && width)
1639 {
1640 int width_mmx = ((width >> 1) << 1) ;
1641 width -= width_mmx;
1642 if (width_mmx)
1643 {
1644 _asm
1645 {
1646 mov esi, sptr
1647 mov edi, dp
1648 mov ecx, width_mmx
1649 sub esi, 4
1650 sub edi, 60
1651 loop4_pass0:
1652 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1653 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1654 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1655 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1656 movq [edi], mm0
1657 movq [edi + 8], mm0
1658 movq [edi + 16], mm0
1659 movq [edi + 24], mm0
1660 movq [edi+32], mm1
1661 movq [edi + 40], mm1
1662 movq [edi+ 48], mm1
1663 sub esi, 8
1664 movq [edi + 56], mm1
1665 sub edi, 64
1666 sub ecx, 2
1667 jnz loop4_pass0
1668 EMMS
1669 }
1670 }
1671
1672 sptr -= (width_mmx*4 - 4); /* sign fixed */
1673 dp -= (width_mmx*32 - 4); /* sign fixed */
1674 for (i = width; i; i--)
1675 {
1676 png_byte v[8];
1677 int j;
1678 sptr -= 4;
1679 png_memcpy(v, sptr, 4);
1680 for (j = 0; j < png_pass_inc[pass]; j++)
1681 {
1682 dp -= 4;
1683 png_memcpy(dp, v, 4);
1684 }
1685 }
1686 }
1687 else if (((pass == 2) || (pass == 3)) && width)
1688 {
1689 int width_mmx = ((width >> 1) << 1) ;
1690 width -= width_mmx;
1691 if (width_mmx)
1692 {
1693 _asm
1694 {
1695 mov esi, sptr
1696 mov edi, dp
1697 mov ecx, width_mmx
1698 sub esi, 4
1699 sub edi, 28
1700 loop4_pass2:
1701 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1702 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1703 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1704 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1705 movq [edi], mm0
1706 movq [edi + 8], mm0
1707 movq [edi+16], mm1
1708 movq [edi + 24], mm1
1709 sub esi, 8
1710 sub edi, 32
1711 sub ecx, 2
1712 jnz loop4_pass2
1713 EMMS
1714 }
1715 }
1716
1717 sptr -= (width_mmx*4 - 4); /* sign fixed */
1718 dp -= (width_mmx*16 - 4); /* sign fixed */
1719 for (i = width; i; i--)
1720 {
1721 png_byte v[8];
1722 int j;
1723 sptr -= 4;
1724 png_memcpy(v, sptr, 4);
1725 for (j = 0; j < png_pass_inc[pass]; j++)
1726 {
1727 dp -= 4;
1728 png_memcpy(dp, v, 4);
1729 }
1730 }
1731 }
1732 else if (width) /* pass == 4 or 5 */
1733 {
1734 int width_mmx = ((width >> 1) << 1) ;
1735 width -= width_mmx;
1736 if (width_mmx)
1737 {
1738 _asm
1739 {
1740 mov esi, sptr
1741 mov edi, dp
1742 mov ecx, width_mmx
1743 sub esi, 4
1744 sub edi, 12
1745 loop4_pass4:
1746 movq mm0, [esi] ; v3 v2 v1 v0 v7 v6 v5 v4
1747 movq mm1, mm0 ; v3 v2 v1 v0 v7 v6 v5 v4
1748 punpckldq mm0, mm0 ; v7 v6 v5 v4 v7 v6 v5 v4
1749 punpckhdq mm1, mm1 ; v3 v2 v1 v0 v3 v2 v1 v0
1750 movq [edi], mm0
1751 sub esi, 8
1752 movq [edi + 8], mm1
1753 sub edi, 16
1754 sub ecx, 2
1755 jnz loop4_pass4
1756 EMMS
1757 }
1758 }
1759
1760 sptr -= (width_mmx*4 - 4); /* sign fixed */
1761 dp -= (width_mmx*8 - 4); /* sign fixed */
1762 for (i = width; i; i--)
1763 {
1764 png_byte v[8];
1765 int j;
1766 sptr -= 4;
1767 png_memcpy(v, sptr, 4);
1768 for (j = 0; j < png_pass_inc[pass]; j++)
1769 {
1770 dp -= 4;
1771 png_memcpy(dp, v, 4);
1772 }
1773 }
1774 }
1775
1776 } /* end of pixel_bytes == 4 */
1777
1778 else if (pixel_bytes == 6)
1779 {
1780 for (i = width; i; i--)
1781 {
1782 png_byte v[8];
1783 int j;
1784 png_memcpy(v, sptr, 6);
1785 for (j = 0; j < png_pass_inc[pass]; j++)
1786 {
1787 png_memcpy(dp, v, 6);
1788 dp -= 6;
1789 }
1790 sptr -= 6;
1791 }
1792 } /* end of pixel_bytes == 6 */
1793
1794 else
1795 {
1796 for (i = width; i; i--)
1797 {
1798 png_byte v[8];
1799 int j;
1800 png_memcpy(v, sptr, pixel_bytes);
1801 for (j = 0; j < png_pass_inc[pass]; j++)
1802 {
1803 png_memcpy(dp, v, pixel_bytes);
1804 dp -= pixel_bytes;
1805 }
1806 sptr-= pixel_bytes;
1807 }
1808 }
1809 } /* end of mmx_supported */
1810
1811 else /* MMX not supported: use modified C code - takes advantage
1812 * of inlining of memcpy for a constant */
1813 {
1814 if (pixel_bytes == 1)
1815 {
1816 for (i = width; i; i--)
1817 {
1818 int j;
1819 for (j = 0; j < png_pass_inc[pass]; j++)
1820 *dp-- = *sptr;
1821 sptr--;
1822 }
1823 }
1824 else if (pixel_bytes == 3)
1825 {
1826 for (i = width; i; i--)
1827 {
1828 png_byte v[8];
1829 int j;
1830 png_memcpy(v, sptr, pixel_bytes);
1831 for (j = 0; j < png_pass_inc[pass]; j++)
1832 {
1833 png_memcpy(dp, v, pixel_bytes);
1834 dp -= pixel_bytes;
1835 }
1836 sptr -= pixel_bytes;
1837 }
1838 }
1839 else if (pixel_bytes == 2)
1840 {
1841 for (i = width; i; i--)
1842 {
1843 png_byte v[8];
1844 int j;
1845 png_memcpy(v, sptr, pixel_bytes);
1846 for (j = 0; j < png_pass_inc[pass]; j++)
1847 {
1848 png_memcpy(dp, v, pixel_bytes);
1849 dp -= pixel_bytes;
1850 }
1851 sptr -= pixel_bytes;
1852 }
1853 }
1854 else if (pixel_bytes == 4)
1855 {
1856 for (i = width; i; i--)
1857 {
1858 png_byte v[8];
1859 int j;
1860 png_memcpy(v, sptr, pixel_bytes);
1861 for (j = 0; j < png_pass_inc[pass]; j++)
1862 {
1863 png_memcpy(dp, v, pixel_bytes);
1864 dp -= pixel_bytes;
1865 }
1866 sptr -= pixel_bytes;
1867 }
1868 }
1869 else if (pixel_bytes == 6)
1870 {
1871 for (i = width; i; i--)
1872 {
1873 png_byte v[8];
1874 int j;
1875 png_memcpy(v, sptr, pixel_bytes);
1876 for (j = 0; j < png_pass_inc[pass]; j++)
1877 {
1878 png_memcpy(dp, v, pixel_bytes);
1879 dp -= pixel_bytes;
1880 }
1881 sptr -= pixel_bytes;
1882 }
1883 }
1884 else
1885 {
1886 for (i = width; i; i--)
1887 {
1888 png_byte v[8];
1889 int j;
1890 png_memcpy(v, sptr, pixel_bytes);
1891 for (j = 0; j < png_pass_inc[pass]; j++)
1892 {
1893 png_memcpy(dp, v, pixel_bytes);
1894 dp -= pixel_bytes;
1895 }
1896 sptr -= pixel_bytes;
1897 }
1898 }
1899
1900 } /* end of MMX not supported */
1901 break;
1902 }
1903 } /* end switch (row_info->pixel_depth) */
1904
1905 row_info->width = final_width;
1906
1907 row_info->rowbytes = PNG_ROWBYTES(row_info->pixel_depth,final_width);
1908 }
1909
1910 }
1911
1912 #endif /* PNG_READ_INTERLACING_SUPPORTED */
1913
1914
1915 /* These variables are utilized in the functions below. They are declared */
1916 /* globally here to ensure alignment on 8-byte boundaries. */
/* Each object is a 64-bit quantity that the inline assembly below reads */
/* straight from memory, either with movq or as the count operand of */
/* psrlq/psllq. */
1917
1918 union uAll {
1919 __int64 use; /* the 64-bit value that is actually read and written */
1920 double align; /* never accessed in the code shown here; presumably present */
/* only to force the 8-byte alignment mentioned above */
1921 } LBCarryMask = {0x0101010101010101}, /* 0x01 in every byte: selects each byte's lsb */
1922 HBClearMask = {0x7f7f7f7f7f7f7f7f}, /* 0x7f in every byte: clears bit 7 of each byte */
/* The remaining objects have no initializer; the filter routines assign */
/* the ones they need (via .use) before reading them. */
1923 ActiveMask, ActiveMask2, ActiveMaskEnd, ShiftBpp, ShiftRem;
1924
1925
1926 /* Optimized code for PNG Average filter decoder.
 *
 * Reverses the Average filter on one row, in place:
 *
 *    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x)) / 2)
 *
 * For the first bpp bytes there is no Raw(x-bpp) term, so the code uses
 * Raw(x) = Avg(x) + (Prior(x) / 2).
 *
 * row_info - supplies pixel_depth (bpp = (pixel_depth + 7) >> 3) and
 *            rowbytes (the number of bytes to process)
 * row      - holds Avg(x) on entry and is overwritten with Raw(x)
 * prev_row - holds Prior(x); it is only read
 *
 * The work is done in three stages:
 *   1. scalar loops for the first bpp bytes and then for the bytes up to
 *      offset `diff`, where row + diff is 8-byte aligned;
 *   2. an MMX loop chosen by bpp that handles 8 bytes per iteration from
 *      offset diff up to offset MMXLength;
 *   3. a scalar clean-up loop from MMXLength up to FullLength.
 * bpp == 1 skips stages 2 and 3 and finishes the row with one scalar loop.
 *
 * In the MMX loops the per-byte average (a + b) / 2 is built as
 * (a >> 1) + (b >> 1) + (a & b & 1): the 64-bit shift right by one is
 * followed by HBClearMask to drop the bit that leaked in from the
 * neighbouring byte, and LBCarryMask recovers the carry of the two lsb's.
 *
 * NOTE(review): the stage 1 loop runs up to diff and every MMX loop tests
 * its bound only after the first iteration, in both cases without
 * comparing against FullLength first. Presumably callers only select this
 * routine for rows that are long enough -- confirm against the caller.
 *
 * NOTE(review): for bpp 2..7 this routine writes the file-scope objects
 * ActiveMask, ShiftBpp and ShiftRem on every call.
 */
1927 void /* PRIVATE */
1928 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row
1929 , png_bytep prev_row)
1930 {
1931 int bpp; /* bytes per pixel, rounded up from pixel_depth */
1932 png_uint_32 FullLength; /* row_info->rowbytes: end offset of the row */
1933 png_uint_32 MMXLength; /* end offset for the 8-byte MMX loops */
1934 /*png_uint_32 len; */
1935 int diff; /* offset from row at which the MMX loops start */
1936
1937 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
1938 FullLength = row_info->rowbytes; /* # of bytes to filter */
/* Stage 1: scalar code only (no MMX instruction is used in this block). */
1939 _asm {
1940 /* Init address pointers and offset */
1941 mov edi, row /* edi ==> Avg(x) */
1942 xor ebx, ebx /* ebx ==> x */
1943 mov edx, edi
1944 mov esi, prev_row /* esi ==> Prior(x) */
1945 sub edx, bpp /* edx ==> Raw(x-bpp) */
1946
1947 xor eax, eax
1948 /* Compute the Raw value for the first bpp bytes */
1949 /* Raw(x) = Avg(x) + (Prior(x)/2) */
1950 davgrlp:
1951 mov al, [esi + ebx] /* Load al with Prior(x) */
1952 inc ebx
1953 shr al, 1 /* divide by 2 */
1954 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
1955 cmp ebx, bpp
1956 mov [edi+ebx-1], al /* Write back Raw(x); */
1957 /* mov does not affect flags; -1 to offset inc ebx */
1958 jb davgrlp
1959 /* get # of bytes to alignment */
/* diff = ((row + bpp + 0xf) & 0xfffffff8) - row, i.e. the offset of the */
/* 8-byte aligned address that lies 8 to 15 bytes past row + bpp */
1960 mov diff, edi /* take start of row */
1961 add diff, ebx /* add bpp */
1962 add diff, 0xf /* add 7 + 8 to incr past alignment boundary */
1963 and diff, 0xfffffff8 /* mask to alignment boundary */
1964 sub diff, edi /* subtract from start ==> value ebx at alignment */
1965 jz davggo /* NOTE(review): diff is at least bpp + 8 here, so this looks never taken */
1966 /* fix alignment */
1967 /* Compute the Raw value for the bytes up to the alignment boundary */
1968 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
1969 xor ecx, ecx
1970 davglp1:
1971 xor eax, eax
1972 mov cl, [esi + ebx] /* load cl with Prior(x) */
1973 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
1974 add ax, cx /* 16-bit add keeps the carry out of bit 7 */
1975 inc ebx
1976 shr ax, 1 /* divide by 2 */
1977 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
1978 cmp ebx, diff /* Check if at alignment boundary */
1979 mov [edi+ebx-1], al /* Write back Raw(x); */
1980 /* mov does not affect flags; -1 to offset inc ebx */
1981 jb davglp1 /* Repeat until at alignment boundary */
1982 davggo:
/* MMXLength = FullLength - ((FullLength - ebx) & 7), so the distance */
/* from ebx to MMXLength is a multiple of 8 */
1983 mov eax, FullLength
1984 mov ecx, eax
1985 sub eax, ebx /* subtract alignment fix */
1986 and eax, 0x00000007 /* calc bytes over mult of 8 */
1987 sub ecx, eax /* drop over bytes from original length */
1988 mov MMXLength, ecx
1989 } /* end _asm block */
1990 /* Now do the math for the rest of the row */
/* Stage 2: one MMX loop per pixel size. In every loop mm0 holds the 8 */
/* bytes being decoded, mm4 = HBClearMask and mm5 = LBCarryMask. */
1991 switch ( bpp )
1992 {
1993 case 3:
1994 {
/* 8 bytes are decoded as three groups: bytes 0-2, 3-5 and 6-7 */
1995 ActiveMask.use = 0x0000000000ffffff;
1996 ShiftBpp.use = 24; /* == 3 * 8 */
1997 ShiftRem.use = 40; /* == 64 - 24 */
1998 _asm {
1999 /* Re-init address pointers and offset */
2000 movq mm7, ActiveMask
2001 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2002 movq mm5, LBCarryMask
2003 mov edi, row /* edi ==> Avg(x) */
2004 movq mm4, HBClearMask
2005 mov esi, prev_row /* esi ==> Prior(x) */
2006 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2007 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2008 /* (we correct position in loop below) */
2009 davg3lp:
2010 movq mm0, [edi + ebx] /* Load mm0 with Avg(x) */
2011 /* Add (Prev_row/2) to Average */
2012 movq mm3, mm5
2013 psrlq mm2, ShiftRem /* Correct position Raw(x-bpp) data */
2014 movq mm1, [esi + ebx] /* Load mm1 with Prior(x) */
2015 movq mm6, mm7 /* mm6 = working copy of the group mask */
2016 pand mm3, mm1 /* get lsb for each prev_row byte */
2017 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2018 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2019 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2020 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2021 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2022 pand mm1, mm2 /* get LBCarrys for each byte where both */
2023 /* lsb's were == 1 (Only valid for active group) */
2024 psrlq mm2, 1 /* divide raw bytes by 2 */
2025 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2026 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2027 pand mm2, mm6 /* Leave only Active Group 1 bytes to add to Avg */
2028 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2029 /* byte */
2030 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2031 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 3-5 */
2032 movq mm2, mm0 /* mov updated Raws to mm2 */
2033 psllq mm2, ShiftBpp /* shift data to position correctly */
2034 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2035 pand mm1, mm2 /* get LBCarrys for each byte where both */
2036 /* lsb's were == 1 (Only valid for active group) */
2037 psrlq mm2, 1 /* divide raw bytes by 2 */
2038 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2039 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2040 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2041 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2042 /* byte */
2043
2044 /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
2045 psllq mm6, ShiftBpp /* shift the mm6 mask to cover the last two */
2046 /* bytes */
2047 movq mm2, mm0 /* mov updated Raws to mm2 */
2048 psllq mm2, ShiftBpp /* shift data to position correctly */
2049 /* Data only needs to be shifted once here to */
2050 /* get the correct x-bpp offset. */
2051 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2052 pand mm1, mm2 /* get LBCarrys for each byte where both */
2053 /* lsb's were == 1 (Only valid for active group) */
2054 psrlq mm2, 1 /* divide raw bytes by 2 */
2055 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2056 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2057 pand mm2, mm6 /* Leave only Active Group 3 bytes to add to Avg */
2058 add ebx, 8
2059 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2060 /* byte */
2061
2062 /* Now ready to write back to memory */
2063 movq [edi + ebx - 8], mm0
2064 /* Move updated Raw(x) to use as Raw(x-bpp) for next loop */
2065 cmp ebx, MMXLength
2066 movq mm2, mm0 /* mov updated Raw(x) to mm2 */
2067 jb davg3lp
2068 } /* end _asm block */
2069 }
2070 break;
2071
2072 case 6:
2073 case 4:
2074 case 7:
2075 case 5:
2076 {
/* 8 bytes are decoded as two groups: the low bpp bytes, then the rest */
2077 ActiveMask.use = 0xffffffffffffffff; /* use shift below to clear */
2078 /* appropriate inactive bytes */
2079 ShiftBpp.use = bpp << 3;
2080 ShiftRem.use = 64 - ShiftBpp.use;
2081 _asm {
2082 movq mm4, HBClearMask
2083 /* Re-init address pointers and offset */
2084 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2085 /* Load ActiveMask and clear all bytes except for 1st active group */
2086 movq mm7, ActiveMask
2087 mov edi, row /* edi ==> Avg(x) */
2088 psrlq mm7, ShiftRem
2089 mov esi, prev_row /* esi ==> Prior(x) */
2090 movq mm6, mm7
2091 movq mm5, LBCarryMask
2092 psllq mm6, ShiftBpp /* Create mask for 2nd active group */
2093 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2094 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2095 /* (we correct position in loop below) */
2096 davg4lp:
2097 movq mm0, [edi + ebx]
2098 psrlq mm2, ShiftRem /* shift data to position correctly */
2099 movq mm1, [esi + ebx]
2100 /* Add (Prev_row/2) to Average */
2101 movq mm3, mm5
2102 pand mm3, mm1 /* get lsb for each prev_row byte */
2103 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2104 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2105 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2106 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2107 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2108 pand mm1, mm2 /* get LBCarrys for each byte where both */
2109 /* lsb's were == 1 (Only valid for active group) */
2110 psrlq mm2, 1 /* divide raw bytes by 2 */
2111 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2112 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2113 pand mm2, mm7 /* Leave only Active Group 1 bytes to add to Avg */
2114 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2115 /* byte */
2116 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2117 movq mm2, mm0 /* mov updated Raws to mm2 */
2118 psllq mm2, ShiftBpp /* shift data to position correctly */
2119 add ebx, 8
2120 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2121 pand mm1, mm2 /* get LBCarrys for each byte where both */
2122 /* lsb's were == 1 (Only valid for active group) */
2123 psrlq mm2, 1 /* divide raw bytes by 2 */
2124 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2125 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2126 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2127 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active */
2128 /* byte */
2129 cmp ebx, MMXLength
2130 /* Now ready to write back to memory */
2131 movq [edi + ebx - 8], mm0
2132 /* Prep Raw(x-bpp) for next loop */
2133 movq mm2, mm0 /* mov updated Raws to mm2 */
2134 jb davg4lp
2135 } /* end _asm block */
2136 }
2137 break;
2138 case 2:
2139 {
/* 8 bytes are decoded as four 2-byte groups */
2140 ActiveMask.use = 0x000000000000ffff;
2141 ShiftBpp.use = 16; /* == 2 * 8 [BUGFIX] */
2142 ShiftRem.use = 48; /* == 64 - 16 [BUGFIX] */
2143 _asm {
2144 /* Load ActiveMask */
2145 movq mm7, ActiveMask
2146 /* Re-init address pointers and offset */
2147 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2148 movq mm5, LBCarryMask
2149 mov edi, row /* edi ==> Avg(x) */
2150 movq mm4, HBClearMask
2151 mov esi, prev_row /* esi ==> Prior(x) */
2152 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2153 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2154 /* (we correct position in loop below) */
2155 davg2lp:
2156 movq mm0, [edi + ebx]
2157 psrlq mm2, ShiftRem /* shift data to position correctly [BUGFIX] */
2158 movq mm1, [esi + ebx]
2159 /* Add (Prev_row/2) to Average */
2160 movq mm3, mm5
2161 pand mm3, mm1 /* get lsb for each prev_row byte */
2162 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2163 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2164 movq mm6, mm7 /* mm6 = working copy of the group mask */
2165 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2166 /* Add 1st active group (Raw(x-bpp)/2) to Average with LBCarry */
2167 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2168 pand mm1, mm2 /* get LBCarrys for each byte where both */
2169 /* lsb's were == 1 (Only valid for active group) */
2170 psrlq mm2, 1 /* divide raw bytes by 2 */
2171 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2172 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2173 pand mm2, mm6 /* Leave only Active Group 1 bytes to add to Avg */
2174 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2175 /* Add 2nd active group (Raw(x-bpp)/2) to Average with LBCarry */
2176 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 2 & 3 */
2177 movq mm2, mm0 /* mov updated Raws to mm2 */
2178 psllq mm2, ShiftBpp /* shift data to position correctly */
2179 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2180 pand mm1, mm2 /* get LBCarrys for each byte where both */
2181 /* lsb's were == 1 (Only valid for active group) */
2182 psrlq mm2, 1 /* divide raw bytes by 2 */
2183 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2184 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2185 pand mm2, mm6 /* Leave only Active Group 2 bytes to add to Avg */
2186 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2187
2188 /* Add 3rd active group (Raw(x-bpp)/2) to Average with LBCarry */
2189 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 4 & 5 */
2190 movq mm2, mm0 /* mov updated Raws to mm2 */
2191 psllq mm2, ShiftBpp /* shift data to position correctly */
2192 /* Data only needs to be shifted once here to */
2193 /* get the correct x-bpp offset. */
2194 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2195 pand mm1, mm2 /* get LBCarrys for each byte where both */
2196 /* lsb's were == 1 (Only valid for active group) */
2197 psrlq mm2, 1 /* divide raw bytes by 2 */
2198 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2199 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2200 pand mm2, mm6 /* Leave only Active Group 3 bytes to add to Avg */
2201 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2202
2203 /* Add 4th active group (Raw(x-bpp)/2) to Average with LBCarry */
2204 psllq mm6, ShiftBpp /* shift the mm6 mask to cover bytes 6 & 7 */
2205 movq mm2, mm0 /* mov updated Raws to mm2 */
2206 psllq mm2, ShiftBpp /* shift data to position correctly */
2207 /* Data only needs to be shifted once here to */
2208 /* get the correct x-bpp offset. */
2209 add ebx, 8
2210 movq mm1, mm3 /* now use mm1 for getting LBCarrys */
2211 pand mm1, mm2 /* get LBCarrys for each byte where both */
2212 /* lsb's were == 1 (Only valid for active group) */
2213 psrlq mm2, 1 /* divide raw bytes by 2 */
2214 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2215 paddb mm2, mm1 /* add LBCarrys to (Raw(x-bpp)/2) for each byte */
2216 pand mm2, mm6 /* Leave only Active Group 4 bytes to add to Avg */
2217 paddb mm0, mm2 /* add (Raw/2) + LBCarrys to Avg for each Active byte */
2218
2219 cmp ebx, MMXLength
2220 /* Now ready to write back to memory */
2221 movq [edi + ebx - 8], mm0
2222 /* Prep Raw(x-bpp) for next loop */
2223 movq mm2, mm0 /* mov updated Raws to mm2 */
2224 jb davg2lp
2225 } /* end _asm block */
2226 }
2227 break;
2228
2229 case 1: /* bpp == 1 */
2230 {
/* No MMX here: the rest of the row, from diff to FullLength, is */
/* decoded one byte at a time. */
2231 _asm {
2232 /* Re-init address pointers and offset */
2233 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2234 mov edi, row /* edi ==> Avg(x) */
2235 cmp ebx, FullLength /* Test if offset at end of array */
2236 jnb davg1end
2237 /* Do Avg decode for remaining bytes */
2238 mov esi, prev_row /* esi ==> Prior(x) */
2239 mov edx, edi
2240 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
2241 sub edx, bpp /* edx ==> Raw(x-bpp) */
2242 davg1lp:
2243 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2244 xor eax, eax
2245 mov cl, [esi + ebx] /* load cl with Prior(x) */
2246 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
2247 add ax, cx
2248 inc ebx
2249 shr ax, 1 /* divide by 2 */
2250 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
2251 cmp ebx, FullLength /* Check if at end of array */
2252 mov [edi+ebx-1], al /* Write back Raw(x); */
2253 /* mov does not affect flags; -1 to offset inc ebx */
2254 jb davg1lp
2255 davg1end:
2256 } /* end _asm block */
2257 }
/* The row is complete. This path has executed no MMX instruction, so */
/* the clean-up block below (including its emms) is skipped. */
2258 return;
2259
2260 case 8: /* bpp == 8 */
2261 {
/* With bpp == 8 the previous 8 bytes are exactly Raw(x-bpp), so no */
/* masks or shift counts are needed. */
2262 _asm {
2263 /* Re-init address pointers and offset */
2264 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2265 movq mm5, LBCarryMask
2266 mov edi, row /* edi ==> Avg(x) */
2267 movq mm4, HBClearMask
2268 mov esi, prev_row /* esi ==> Prior(x) */
2269 /* PRIME the pump (load the first Raw(x-bpp) data set) */
2270 movq mm2, [edi + ebx - 8] /* Load previous aligned 8 bytes */
2271 /* (NO NEED to correct position in loop below) */
2272 davg8lp:
2273 movq mm0, [edi + ebx]
2274 movq mm3, mm5
2275 movq mm1, [esi + ebx]
2276 add ebx, 8
2277 pand mm3, mm1 /* get lsb for each prev_row byte */
2278 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2279 pand mm3, mm2 /* get LBCarrys for each byte where both */
2280 /* lsb's were == 1 */
2281 psrlq mm2, 1 /* divide raw bytes by 2 */
2282 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2283 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */
2284 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2285 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2286 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */
2287 cmp ebx, MMXLength
2288 movq [edi + ebx - 8], mm0
2289 movq mm2, mm0 /* reuse as Raw(x-bpp) */
2290 jb davg8lp
2291 } /* end _asm block */
2292 }
2293 break;
2294 default: /* bpp greater than 8 */
2295 {
/* Raw(x-bpp) is reloaded from memory through edx on every iteration */
/* instead of being carried over in mm2. */
2296 _asm {
2297 movq mm5, LBCarryMask
2298 /* Re-init address pointers and offset */
2299 mov ebx, diff /* ebx ==> x = offset to alignment boundary */
2300 mov edi, row /* edi ==> Avg(x) */
2301 movq mm4, HBClearMask
2302 mov edx, edi
2303 mov esi, prev_row /* esi ==> Prior(x) */
2304 sub edx, bpp /* edx ==> Raw(x-bpp) */
2305 davgAlp:
2306 movq mm0, [edi + ebx]
2307 movq mm3, mm5
2308 movq mm1, [esi + ebx]
2309 pand mm3, mm1 /* get lsb for each prev_row byte */
2310 movq mm2, [edx + ebx]
2311 psrlq mm1, 1 /* divide prev_row bytes by 2 */
2312 pand mm3, mm2 /* get LBCarrys for each byte where both */
2313 /* lsb's were == 1 */
2314 psrlq mm2, 1 /* divide raw bytes by 2 */
2315 pand mm1, mm4 /* clear invalid bit 7 of each byte */
2316 paddb mm0, mm3 /* add LBCarrys to Avg for each byte */
2317 pand mm2, mm4 /* clear invalid bit 7 of each byte */
2318 paddb mm0, mm1 /* add (Prev_row/2) to Avg for each byte */
2319 add ebx, 8
2320 paddb mm0, mm2 /* add (Raw/2) to Avg for each byte */
2321 cmp ebx, MMXLength
2322 movq [edi + ebx - 8], mm0
2323 jb davgAlp
2324 } /* end _asm block */
2325 }
2326 break;
2327 } /* end switch ( bpp ) */
2328
/* Stage 3: scalar loop for the bytes from MMXLength up to FullLength. */
2329 _asm {
2330 /* MMX acceleration complete now do clean-up */
2331 /* Check if any remaining bytes left to decode */
2332 mov ebx, MMXLength /* ebx ==> x = offset bytes remaining after MMX */
2333 mov edi, row /* edi ==> Avg(x) */
2334 cmp ebx, FullLength /* Test if offset at end of array */
2335 jnb davgend
2336 /* Do Avg decode for remaining bytes */
2337 mov esi, prev_row /* esi ==> Prior(x) */
2338 mov edx, edi
2339 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
2340 sub edx, bpp /* edx ==> Raw(x-bpp) */
2341 davglp2:
2342 /* Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2) */
2343 xor eax, eax
2344 mov cl, [esi + ebx] /* load cl with Prior(x) */
2345 mov al, [edx + ebx] /* load al with Raw(x-bpp) */
2346 add ax, cx
2347 inc ebx
2348 shr ax, 1 /* divide by 2 */
2349 add al, [edi+ebx-1] /* Add Avg(x); -1 to offset inc ebx */
2350 cmp ebx, FullLength /* Check if at end of array */
2351 mov [edi+ebx-1], al /* Write back Raw(x); */
2352 /* mov does not affect flags; -1 to offset inc ebx */
2353 jb davglp2
2354 davgend:
2355 emms /* End MMX instructions; prep for possible FP instrs. */
2356 } /* end _asm block */
2357 }
2358
2359 /* Optimized code for PNG Paeth filter decoder.
 *
 * Reverses the Paeth filter on one row, in place.
 *
 *   row_info  only pixel_depth and rowbytes are read
 *   row       filtered bytes on entry; overwritten with reconstructed bytes
 *   prev_row  the previous row; only read
 *
 * Notation used in the comments below (x is a byte offset into the row):
 *   a = Raw(x-bpp)    already reconstructed byte to the left (from row)
 *   b = Prior(x)      byte above (from prev_row)
 *   c = Prior(x-bpp)  byte above-left (from prev_row)
 *   p = a + b - c;  pa = abs(p-a), pb = abs(p-b), pc = abs(p-c)
 * The predictor is a if pa <= pb and pa <= pc, else b if pb <= pc, else c,
 * and Raw(x) = (Paeth(x) + predictor) mod 256.
 *
 * Phases:
 *   1. The first bpp bytes use Prior(x) alone as predictor.
 *   2. A byte-at-a-time loop runs up to offset `diff`, the offset at which
 *      row + diff is 8-byte aligned.
 *   3. MMXLength = FullLength - ((FullLength - diff) & 7) is computed.
 *   4. For bpp 3, 4, 5/6/7 and 8 an MMX loop handles 8 bytes per iteration
 *      up to MMXLength.  For bpp 1, 2 and anything else a byte-at-a-time
 *      loop finishes the row from `diff` and the function returns.
 *   5. A byte-at-a-time loop finishes MMXLength..FullLength, then emms.
 *
 * NOTE(review): the alignment loop always runs up to `diff`, and every MMX
 * loop tests its bound at the bottom (so it executes at least once);
 * neither compares against FullLength first.  Presumably the caller only
 * sends sufficiently long rows here and/or the row buffers are padded --
 * confirm against the caller.
 *
 * NOTE(review): ActiveMask, ActiveMaskEnd, ActiveMask2, ShiftBpp and
 * ShiftRem are not declared in this function; they are written here
 * through a `.use` member and are presumably file-scope 64-bit variables
 * shared with the other filter routines -- confirm before assuming this
 * routine is reentrant.
 */
2360 void /* PRIVATE */
2361 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
2362 png_bytep prev_row)
2363 {
2364 png_uint_32 FullLength; /* number of bytes in the row (rowbytes) */
2365 png_uint_32 MMXLength; /* offset at which the 8-byte MMX loops stop */
2366 /*png_uint_32 len; */
2367 int bpp;
2368 int diff; /* offset into row where the address becomes 8-byte aligned */
2369 /*int ptemp; */
2370 int patemp, pbtemp, pctemp; /* scalar-loop scratch; pbtemp is only ever written */
2371
2372 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
2373 FullLength = row_info->rowbytes; /* # of bytes to filter */
2374 _asm
2375 {
2376 xor ebx, ebx /* ebx ==> x offset */
2377 mov edi, row
2378 xor edx, edx /* edx ==> x-bpp offset */
2379 mov esi, prev_row
2380 xor eax, eax
2381
2382 /* Compute the Raw value for the first bpp bytes */
2383 /* Note: the formula works out to be always */
2384 /* Raw(x) = Paeth(x) + Prior(x) where x < bpp */
2385 dpthrlp:
2386 mov al, [edi + ebx]
2387 add al, [esi + ebx]
2388 inc ebx
2389 cmp ebx, bpp
2390 mov [edi + ebx - 1], al /* mov leaves the flags from cmp intact */
2391 jb dpthrlp
2392 /* get # of bytes to alignment */
2393 mov diff, edi /* take start of row */
2394 add diff, ebx /* add bpp */
2395 xor ecx, ecx /* only cl is loaded below; keep upper bits of ecx zero */
2396 add diff, 0xf /* add 7 + 8 to incr past alignment boundary */
2397 and diff, 0xfffffff8 /* mask to alignment boundary */
2398 sub diff, edi /* subtract from start ==> value ebx at alignment */
2399 jz dpthgo /* taken only if diff == 0; the +0xf bias makes diff >= bpp + 8 */
2400 /* fix alignment */
2401 dpthlp1:
2402 xor eax, eax
2403 /* pav = p - a = (a + b - c) - a = b - c */
2404 mov al, [esi + ebx] /* load Prior(x) into al */
2405 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
2406 sub eax, ecx /* subtract Prior(x-bpp) */
2407 mov patemp, eax /* Save pav for later use */
2408 xor eax, eax
2409 /* pbv = p - b = (a + b - c) - b = a - c */
2410 mov al, [edi + edx] /* load Raw(x-bpp) into al */
2411 sub eax, ecx /* subtract Prior(x-bpp) */
2412 mov ecx, eax
2413 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2414 add eax, patemp /* pcv = pav + pbv */
2415 /* pc = abs(pcv) */
2416 test eax, 0x80000000
2417 jz dpthpca
2418 neg eax /* reverse sign of neg values */
2419 dpthpca:
2420 mov pctemp, eax /* save pc for later use */
2421 /* pb = abs(pbv) */
2422 test ecx, 0x80000000
2423 jz dpthpba
2424 neg ecx /* reverse sign of neg values */
2425 dpthpba:
2426 mov pbtemp, ecx /* save pb for later use */
2427 /* pa = abs(pav) */
2428 mov eax, patemp
2429 test eax, 0x80000000
2430 jz dpthpaa
2431 neg eax /* reverse sign of neg values */
2432 dpthpaa:
2433 mov patemp, eax /* save pa for later use */
2434 /* test if pa <= pb */
2435 cmp eax, ecx
2436 jna dpthabb
2437 /* pa > pb; now test if pb <= pc */
2438 cmp ecx, pctemp
2439 jna dpthbbc
2440 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2441 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
2442 jmp dpthpaeth
2443 dpthbbc:
2444 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
2445 mov cl, [esi + ebx] /* load Prior(x) into cl */
2446 jmp dpthpaeth
2447 dpthabb:
2448 /* pa <= pb; now test if pa <= pc */
2449 cmp eax, pctemp
2450 jna dpthabc
2451 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
2452 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
2453 jmp dpthpaeth
2454 dpthabc:
2455 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
2456 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
2457 dpthpaeth:
2458 inc ebx
2459 inc edx
2460 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
2461 add [edi + ebx - 1], cl
2462 cmp ebx, diff
2463 jb dpthlp1
2464 dpthgo:
2465 mov ecx, FullLength
2466 mov eax, ecx
2467 sub eax, ebx /* subtract alignment fix */
2468 and eax, 0x00000007 /* calc bytes over mult of 8 */
2469 sub ecx, eax /* drop over bytes from original length */
2470 mov MMXLength, ecx
2471 } /* end _asm block */
2472 /* Now do the math for the rest of the row */
2473 switch ( bpp )
2474 {
2475 case 3:
2476 {
2477 ActiveMask.use = 0x0000000000ffffff; /* keeps bytes 0-2 of a quadword */
2478 ActiveMaskEnd.use = 0xffff000000000000; /* keeps bytes 6-7 of a quadword */
2479 ShiftBpp.use = 24; /* == bpp(3) * 8 */
2480 ShiftRem.use = 40; /* == 64 - 24 */
2481 _asm
2482 {
2483 mov ebx, diff
2484 mov edi, row
2485 mov esi, prev_row
2486 pxor mm0, mm0
2487 /* PRIME the pump (load the first Raw(x-bpp) data set */
2488 movq mm1, [edi+ebx-8]
2489 dpth3lp:
2490 psrlq mm1, ShiftRem /* shift last 3 bytes to 1st 3 bytes */
2491 movq mm2, [esi + ebx] /* load b=Prior(x) */
2492 punpcklbw mm1, mm0 /* Unpack Low bytes of a into words */
2493 movq mm3, [esi+ebx-8] /* Prep c=Prior(x-bpp) bytes */
2494 punpcklbw mm2, mm0 /* Unpack Low bytes of b into words */
2495 psrlq mm3, ShiftRem /* shift last 3 bytes to 1st 3 bytes */
2496 /* pav = p - a = (a + b - c) - a = b - c */
2497 movq mm4, mm2
2498 punpcklbw mm3, mm0 /* Unpack Low bytes of c into words */
2499 /* pbv = p - b = (a + b - c) - b = a - c */
2500 movq mm5, mm1
2501 psubw mm4, mm3
2502 pxor mm7, mm7
2503 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2504 movq mm6, mm4
2505 psubw mm5, mm3
2506
2507 /* pa = abs(p-a) = abs(pav) */
2508 /* pb = abs(p-b) = abs(pbv) */
2509 /* pc = abs(p-c) = abs(pcv) */
2510 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2511 paddw mm6, mm5
2512 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
2513 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2514 psubw mm4, mm0
2515 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
2516 psubw mm4, mm0 /* subtracting the negative part twice negates it: pa */
2517 psubw mm5, mm7
2518 pxor mm0, mm0
2519 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2520 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
2521 psubw mm5, mm7
2522 psubw mm6, mm0
2523 /* test pa <= pb */
2524 movq mm7, mm4
2525 psubw mm6, mm0
2526 pcmpgtw mm7, mm5 /* pa > pb? */
2527 movq mm0, mm7
2528 /* use mm7 mask to merge pa & pb */
2529 pand mm5, mm7
2530 /* use mm0 mask copy to merge a & b */
2531 pand mm2, mm0
2532 pandn mm7, mm4
2533 pandn mm0, mm1
2534 paddw mm7, mm5
2535 paddw mm0, mm2
2536 /* test ((pa <= pb)? pa:pb) <= pc */
2537 pcmpgtw mm7, mm6 /* pab > pc? */
2538 pxor mm1, mm1
2539 pand mm3, mm7
2540 pandn mm7, mm0
2541 paddw mm7, mm3
2542 pxor mm0, mm0
2543 packuswb mm7, mm1 /* words -> bytes; predictor ends up in the low 4 bytes */
2544 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */
2545 pand mm7, ActiveMask
2546 movq mm2, mm3 /* load b=Prior(x) step 1 */
2547 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2548 punpcklbw mm3, mm0 /* Unpack Low bytes of c into words */
2549 movq [edi + ebx], mm7 /* write back updated value */
2550 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */
2551 /* Now do Paeth for 2nd set of bytes (3-5) */
2552 psrlq mm2, ShiftBpp /* load b=Prior(x) step 2 */
2553 punpcklbw mm1, mm0 /* Unpack Low bytes of a into words */
2554 pxor mm7, mm7
2555 punpcklbw mm2, mm0 /* Unpack Low bytes of b into words */
2556 /* pbv = p - b = (a + b - c) - b = a - c */
2557 movq mm5, mm1
2558 /* pav = p - a = (a + b - c) - a = b - c */
2559 movq mm4, mm2
2560 psubw mm5, mm3
2561 psubw mm4, mm3
2562 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = */
2563 /* pav + pbv = pbv + pav */
2564 movq mm6, mm5
2565 paddw mm6, mm4
2566
2567 /* pa = abs(p-a) = abs(pav) */
2568 /* pb = abs(p-b) = abs(pbv) */
2569 /* pc = abs(p-c) = abs(pcv) */
2570 pcmpgtw mm0, mm5 /* Create mask pbv bytes < 0 */
2571 pcmpgtw mm7, mm4 /* Create mask pav bytes < 0 */
2572 pand mm0, mm5 /* Only pbv bytes < 0 in mm0 */
2573 pand mm7, mm4 /* Only pav bytes < 0 in mm7 */
2574 psubw mm5, mm0
2575 psubw mm4, mm7
2576 psubw mm5, mm0
2577 psubw mm4, mm7
2578 pxor mm0, mm0
2579 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2580 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
2581 psubw mm6, mm0
2582 /* test pa <= pb */
2583 movq mm7, mm4
2584 psubw mm6, mm0
2585 pcmpgtw mm7, mm5 /* pa > pb? */
2586 movq mm0, mm7
2587 /* use mm7 mask to merge pa & pb */
2588 pand mm5, mm7
2589 /* use mm0 mask copy to merge a & b */
2590 pand mm2, mm0
2591 pandn mm7, mm4
2592 pandn mm0, mm1
2593 paddw mm7, mm5
2594 paddw mm0, mm2
2595 /* test ((pa <= pb)? pa:pb) <= pc */
2596 pcmpgtw mm7, mm6 /* pab > pc? */
2597 movq mm2, [esi + ebx] /* load b=Prior(x) */
2598 pand mm3, mm7
2599 pandn mm7, mm0
2600 pxor mm1, mm1
2601 paddw mm7, mm3
2602 pxor mm0, mm0
2603 packuswb mm7, mm1
2604 movq mm3, mm2 /* load c=Prior(x-bpp) step 1 */
2605 pand mm7, ActiveMask
2606 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2607 psllq mm7, ShiftBpp /* Shift bytes to 2nd group of 3 bytes */
2608 /* pav = p - a = (a + b - c) - a = b - c */
2609 movq mm4, mm2
2610 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2611 psllq mm3, ShiftBpp /* load c=Prior(x-bpp) step 2 */
2612 movq [edi + ebx], mm7 /* write back updated value */
2613 movq mm1, mm7
2614 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2615 psllq mm1, ShiftBpp /* Shift bytes */
2616 /* Now mm1 will be used as Raw(x-bpp) */
2617 /* Now do Paeth for 3rd, and final, set of bytes (6-7) */
2618 pxor mm7, mm7
2619 punpckhbw mm1, mm0 /* Unpack High bytes of a */
2620 psubw mm4, mm3
2621 /* pbv = p - b = (a + b - c) - b = a - c */
2622 movq mm5, mm1
2623 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2624 movq mm6, mm4
2625 psubw mm5, mm3
2626 pxor mm0, mm0
2627 paddw mm6, mm5
2628
2629 /* pa = abs(p-a) = abs(pav) */
2630 /* pb = abs(p-b) = abs(pbv) */
2631 /* pc = abs(p-c) = abs(pcv) */
2632 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2633 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2634 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
2635 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
2636 psubw mm4, mm0
2637 psubw mm5, mm7
2638 psubw mm4, mm0
2639 psubw mm5, mm7
2640 pxor mm0, mm0
2641 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2642 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
2643 psubw mm6, mm0
2644 /* test pa <= pb */
2645 movq mm7, mm4
2646 psubw mm6, mm0
2647 pcmpgtw mm7, mm5 /* pa > pb? */
2648 movq mm0, mm7
2649 /* use mm0 mask copy to merge a & b */
2650 pand mm2, mm0
2651 /* use mm7 mask to merge pa & pb */
2652 pand mm5, mm7
2653 pandn mm0, mm1
2654 pandn mm7, mm4
2655 paddw mm0, mm2
2656 paddw mm7, mm5
2657 /* test ((pa <= pb)? pa:pb) <= pc */
2658 pcmpgtw mm7, mm6 /* pab > pc? */
2659 pand mm3, mm7
2660 pandn mm7, mm0
2661 paddw mm7, mm3
2662 pxor mm1, mm1
2663 packuswb mm1, mm7 /* words -> bytes; predictor ends up in the high 4 bytes */
2664 /* Step ebx to next set of 8 bytes and repeat loop til done */
2665 add ebx, 8
2666 pand mm1, ActiveMaskEnd
2667 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
2668
2669 cmp ebx, MMXLength
2670 pxor mm0, mm0 /* pxor does not affect flags */
2671 movq [edi + ebx - 8], mm1 /* write back updated value */
2672 /* mm1 will be used as Raw(x-bpp) next loop */
2673 /* (mm3 is reloaded from prev_row at the top of the loop) */
2674 jb dpth3lp
2675 } /* end _asm block */
2676 }
2677 break;
2678
2679 case 6:
2680 case 7:
2681 case 5:
2682 {
2683 ActiveMask.use = 0x00000000ffffffff;
2684 ActiveMask2.use = 0xffffffff00000000; /* set here but not referenced by the asm below */
2685 ShiftBpp.use = bpp << 3; /* == bpp * 8 */
2686 ShiftRem.use = 64 - ShiftBpp.use;
2687 _asm
2688 {
2689 mov ebx, diff
2690 mov edi, row
2691 mov esi, prev_row
2692 /* PRIME the pump (load the first Raw(x-bpp) data set */
2693 movq mm1, [edi+ebx-8]
2694 pxor mm0, mm0
2695 dpth6lp:
2696 /* Must shift to position Raw(x-bpp) data */
2697 psrlq mm1, ShiftRem
2698 /* Do first set of 4 bytes */
2699 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2700 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2701 movq mm2, [esi + ebx] /* load b=Prior(x) */
2702 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2703 /* Must shift to position Prior(x-bpp) data */
2704 psrlq mm3, ShiftRem
2705 /* pav = p - a = (a + b - c) - a = b - c */
2706 movq mm4, mm2
2707 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2708 /* pbv = p - b = (a + b - c) - b = a - c */
2709 movq mm5, mm1
2710 psubw mm4, mm3
2711 pxor mm7, mm7
2712 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2713 movq mm6, mm4
2714 psubw mm5, mm3
2715 /* pa = abs(p-a) = abs(pav) */
2716 /* pb = abs(p-b) = abs(pbv) */
2717 /* pc = abs(p-c) = abs(pcv) */
2718 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2719 paddw mm6, mm5
2720 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
2721 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2722 psubw mm4, mm0
2723 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
2724 psubw mm4, mm0
2725 psubw mm5, mm7
2726 pxor mm0, mm0
2727 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2728 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
2729 psubw mm5, mm7
2730 psubw mm6, mm0
2731 /* test pa <= pb */
2732 movq mm7, mm4
2733 psubw mm6, mm0
2734 pcmpgtw mm7, mm5 /* pa > pb? */
2735 movq mm0, mm7
2736 /* use mm7 mask to merge pa & pb */
2737 pand mm5, mm7
2738 /* use mm0 mask copy to merge a & b */
2739 pand mm2, mm0
2740 pandn mm7, mm4
2741 pandn mm0, mm1
2742 paddw mm7, mm5
2743 paddw mm0, mm2
2744 /* test ((pa <= pb)? pa:pb) <= pc */
2745 pcmpgtw mm7, mm6 /* pab > pc? */
2746 pxor mm1, mm1
2747 pand mm3, mm7
2748 pandn mm7, mm0
2749 paddw mm7, mm3
2750 pxor mm0, mm0
2751 packuswb mm7, mm1
2752 movq mm3, [esi + ebx - 8] /* load c=Prior(x-bpp) */
2753 pand mm7, ActiveMask
2754 psrlq mm3, ShiftRem
2755 movq mm2, [esi + ebx] /* load b=Prior(x) step 1 */
2756 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2757 movq mm6, mm2
2758 movq [edi + ebx], mm7 /* write back updated value */
2759 movq mm1, [edi+ebx-8]
2760 psllq mm6, ShiftBpp
2761 movq mm5, mm7
2762 psrlq mm1, ShiftRem
2763 por mm3, mm6
2764 psllq mm5, ShiftBpp
2765 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2766 por mm1, mm5
2767 /* Do second set of 4 bytes */
2768 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2769 punpckhbw mm1, mm0 /* Unpack High bytes of a */
2770 /* pav = p - a = (a + b - c) - a = b - c */
2771 movq mm4, mm2
2772 /* pbv = p - b = (a + b - c) - b = a - c */
2773 movq mm5, mm1
2774 psubw mm4, mm3
2775 pxor mm7, mm7
2776 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2777 movq mm6, mm4
2778 psubw mm5, mm3
2779 /* pa = abs(p-a) = abs(pav) */
2780 /* pb = abs(p-b) = abs(pbv) */
2781 /* pc = abs(p-c) = abs(pcv) */
2782 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2783 paddw mm6, mm5
2784 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
2785 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2786 psubw mm4, mm0
2787 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
2788 psubw mm4, mm0
2789 psubw mm5, mm7
2790 pxor mm0, mm0
2791 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2792 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
2793 psubw mm5, mm7
2794 psubw mm6, mm0
2795 /* test pa <= pb */
2796 movq mm7, mm4
2797 psubw mm6, mm0
2798 pcmpgtw mm7, mm5 /* pa > pb? */
2799 movq mm0, mm7
2800 /* use mm7 mask to merge pa & pb */
2801 pand mm5, mm7
2802 /* use mm0 mask copy to merge a & b */
2803 pand mm2, mm0
2804 pandn mm7, mm4
2805 pandn mm0, mm1
2806 paddw mm7, mm5
2807 paddw mm0, mm2
2808 /* test ((pa <= pb)? pa:pb) <= pc */
2809 pcmpgtw mm7, mm6 /* pab > pc? */
2810 pxor mm1, mm1
2811 pand mm3, mm7
2812 pandn mm7, mm0
2813 pxor mm1, mm1 /* (redundant: mm1 was already cleared above) */
2814 paddw mm7, mm3
2815 pxor mm0, mm0
2816 /* Step ebx to next set of 8 bytes and repeat loop til done */
2817 add ebx, 8
2818 packuswb mm1, mm7 /* words -> bytes; predictor ends up in the high 4 bytes */
2819 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
2820 cmp ebx, MMXLength
2821 movq [edi + ebx - 8], mm1 /* write back updated value */
2822 /* mm1 will be used as Raw(x-bpp) next loop */
2823 jb dpth6lp
2824 } /* end _asm block */
2825 }
2826 break;
2827
2828 case 4:
2829 {
2830 ActiveMask.use = 0x00000000ffffffff;
2831 _asm {
2832 mov ebx, diff
2833 mov edi, row
2834 mov esi, prev_row
2835 pxor mm0, mm0
2836 /* PRIME the pump (load the first Raw(x-bpp) data set */
2837 movq mm1, [edi+ebx-8] /* Only time should need to read */
2838 /* a=Raw(x-bpp) bytes */
2839 dpth4lp:
2840 /* Do first set of 4 bytes */
2841 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2842 punpckhbw mm1, mm0 /* Unpack High bytes of a */
2843 movq mm2, [esi + ebx] /* load b=Prior(x) */
2844 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2845 /* pav = p - a = (a + b - c) - a = b - c */
2846 movq mm4, mm2
2847 punpckhbw mm3, mm0 /* Unpack High bytes of c */
2848 /* pbv = p - b = (a + b - c) - b = a - c */
2849 movq mm5, mm1
2850 psubw mm4, mm3
2851 pxor mm7, mm7
2852 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2853 movq mm6, mm4
2854 psubw mm5, mm3
2855 /* pa = abs(p-a) = abs(pav) */
2856 /* pb = abs(p-b) = abs(pbv) */
2857 /* pc = abs(p-c) = abs(pcv) */
2858 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2859 paddw mm6, mm5
2860 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
2861 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2862 psubw mm4, mm0
2863 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
2864 psubw mm4, mm0
2865 psubw mm5, mm7
2866 pxor mm0, mm0
2867 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2868 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
2869 psubw mm5, mm7
2870 psubw mm6, mm0
2871 /* test pa <= pb */
2872 movq mm7, mm4
2873 psubw mm6, mm0
2874 pcmpgtw mm7, mm5 /* pa > pb? */
2875 movq mm0, mm7
2876 /* use mm7 mask to merge pa & pb */
2877 pand mm5, mm7
2878 /* use mm0 mask copy to merge a & b */
2879 pand mm2, mm0
2880 pandn mm7, mm4
2881 pandn mm0, mm1
2882 paddw mm7, mm5
2883 paddw mm0, mm2
2884 /* test ((pa <= pb)? pa:pb) <= pc */
2885 pcmpgtw mm7, mm6 /* pab > pc? */
2886 pxor mm1, mm1
2887 pand mm3, mm7
2888 pandn mm7, mm0
2889 paddw mm7, mm3
2890 pxor mm0, mm0
2891 packuswb mm7, mm1
2892 movq mm3, [esi + ebx] /* load c=Prior(x-bpp) */
2893 pand mm7, ActiveMask
2894 movq mm2, mm3 /* load b=Prior(x) step 1 */
2895 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
2896 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2897 movq [edi + ebx], mm7 /* write back updated value */
2898 movq mm1, mm7 /* Now mm1 will be used as Raw(x-bpp) */
2899 /* Do second set of 4 bytes */
2900 punpckhbw mm2, mm0 /* Unpack High bytes of b */
2901 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2902 /* pav = p - a = (a + b - c) - a = b - c */
2903 movq mm4, mm2
2904 /* pbv = p - b = (a + b - c) - b = a - c */
2905 movq mm5, mm1
2906 psubw mm4, mm3
2907 pxor mm7, mm7
2908 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2909 movq mm6, mm4
2910 psubw mm5, mm3
2911 /* pa = abs(p-a) = abs(pav) */
2912 /* pb = abs(p-b) = abs(pbv) */
2913 /* pc = abs(p-c) = abs(pcv) */
2914 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2915 paddw mm6, mm5
2916 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
2917 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2918 psubw mm4, mm0
2919 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
2920 psubw mm4, mm0
2921 psubw mm5, mm7
2922 pxor mm0, mm0
2923 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2924 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
2925 psubw mm5, mm7
2926 psubw mm6, mm0
2927 /* test pa <= pb */
2928 movq mm7, mm4
2929 psubw mm6, mm0
2930 pcmpgtw mm7, mm5 /* pa > pb? */
2931 movq mm0, mm7
2932 /* use mm7 mask to merge pa & pb */
2933 pand mm5, mm7
2934 /* use mm0 mask copy to merge a & b */
2935 pand mm2, mm0
2936 pandn mm7, mm4
2937 pandn mm0, mm1
2938 paddw mm7, mm5
2939 paddw mm0, mm2
2940 /* test ((pa <= pb)? pa:pb) <= pc */
2941 pcmpgtw mm7, mm6 /* pab > pc? */
2942 pxor mm1, mm1
2943 pand mm3, mm7
2944 pandn mm7, mm0
2945 pxor mm1, mm1 /* (redundant: mm1 was already cleared above) */
2946 paddw mm7, mm3
2947 pxor mm0, mm0
2948 /* Step ebx to next set of 8 bytes and repeat loop til done */
2949 add ebx, 8
2950 packuswb mm1, mm7 /* words -> bytes; predictor ends up in the high 4 bytes */
2951 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
2952 cmp ebx, MMXLength
2953 movq [edi + ebx - 8], mm1 /* write back updated value */
2954 /* mm1 will be used as Raw(x-bpp) next loop */
2955 jb dpth4lp
2956 } /* end _asm block */
2957 }
2958 break;
2959 case 8: /* bpp == 8 */
2960 {
2961 ActiveMask.use = 0x00000000ffffffff;
2962 _asm {
2963 mov ebx, diff
2964 mov edi, row
2965 mov esi, prev_row
2966 pxor mm0, mm0
2967 /* PRIME the pump (load the first Raw(x-bpp) data set */
2968 movq mm1, [edi+ebx-8] /* Only time should need to read */
2969 /* a=Raw(x-bpp) bytes */
2970 dpth8lp:
2971 /* Do first set of 4 bytes */
2972 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
2973 punpcklbw mm1, mm0 /* Unpack Low bytes of a */
2974 movq mm2, [esi + ebx] /* load b=Prior(x) */
2975 punpcklbw mm2, mm0 /* Unpack Low bytes of b */
2976 /* pav = p - a = (a + b - c) - a = b - c */
2977 movq mm4, mm2
2978 punpcklbw mm3, mm0 /* Unpack Low bytes of c */
2979 /* pbv = p - b = (a + b - c) - b = a - c */
2980 movq mm5, mm1
2981 psubw mm4, mm3
2982 pxor mm7, mm7
2983 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
2984 movq mm6, mm4
2985 psubw mm5, mm3
2986 /* pa = abs(p-a) = abs(pav) */
2987 /* pb = abs(p-b) = abs(pbv) */
2988 /* pc = abs(p-c) = abs(pcv) */
2989 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
2990 paddw mm6, mm5
2991 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
2992 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
2993 psubw mm4, mm0
2994 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
2995 psubw mm4, mm0
2996 psubw mm5, mm7
2997 pxor mm0, mm0
2998 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
2999 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
3000 psubw mm5, mm7
3001 psubw mm6, mm0
3002 /* test pa <= pb */
3003 movq mm7, mm4
3004 psubw mm6, mm0
3005 pcmpgtw mm7, mm5 /* pa > pb? */
3006 movq mm0, mm7
3007 /* use mm7 mask to merge pa & pb */
3008 pand mm5, mm7
3009 /* use mm0 mask copy to merge a & b */
3010 pand mm2, mm0
3011 pandn mm7, mm4
3012 pandn mm0, mm1
3013 paddw mm7, mm5
3014 paddw mm0, mm2
3015 /* test ((pa <= pb)? pa:pb) <= pc */
3016 pcmpgtw mm7, mm6 /* pab > pc? */
3017 pxor mm1, mm1
3018 pand mm3, mm7
3019 pandn mm7, mm0
3020 paddw mm7, mm3
3021 pxor mm0, mm0
3022 packuswb mm7, mm1
3023 movq mm3, [esi+ebx-8] /* read c=Prior(x-bpp) bytes */
3024 pand mm7, ActiveMask
3025 movq mm2, [esi + ebx] /* load b=Prior(x) */
3026 paddb mm7, [edi + ebx] /* add Paeth predictor with Raw(x) */
3027 punpckhbw mm3, mm0 /* Unpack High bytes of c */
3028 movq [edi + ebx], mm7 /* write back updated value */
3029 movq mm1, [edi+ebx-8] /* read a=Raw(x-bpp) bytes */
3030
3031 /* Do second set of 4 bytes */
3032 punpckhbw mm2, mm0 /* Unpack High bytes of b */
3033 punpckhbw mm1, mm0 /* Unpack High bytes of a */
3034 /* pav = p - a = (a + b - c) - a = b - c */
3035 movq mm4, mm2
3036 /* pbv = p - b = (a + b - c) - b = a - c */
3037 movq mm5, mm1
3038 psubw mm4, mm3
3039 pxor mm7, mm7
3040 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3041 movq mm6, mm4
3042 psubw mm5, mm3
3043 /* pa = abs(p-a) = abs(pav) */
3044 /* pb = abs(p-b) = abs(pbv) */
3045 /* pc = abs(p-c) = abs(pcv) */
3046 pcmpgtw mm0, mm4 /* Create mask pav bytes < 0 */
3047 paddw mm6, mm5
3048 pand mm0, mm4 /* mm0 = pav where pav < 0, else 0 */
3049 pcmpgtw mm7, mm5 /* Create mask pbv bytes < 0 */
3050 psubw mm4, mm0
3051 pand mm7, mm5 /* mm7 = pbv where pbv < 0, else 0 */
3052 psubw mm4, mm0
3053 psubw mm5, mm7
3054 pxor mm0, mm0
3055 pcmpgtw mm0, mm6 /* Create mask pcv bytes < 0 */
3056 pand mm0, mm6 /* mm0 = pcv where pcv < 0, else 0 */
3057 psubw mm5, mm7
3058 psubw mm6, mm0
3059 /* test pa <= pb */
3060 movq mm7, mm4
3061 psubw mm6, mm0
3062 pcmpgtw mm7, mm5 /* pa > pb? */
3063 movq mm0, mm7
3064 /* use mm7 mask to merge pa & pb */
3065 pand mm5, mm7
3066 /* use mm0 mask copy to merge a & b */
3067 pand mm2, mm0
3068 pandn mm7, mm4
3069 pandn mm0, mm1
3070 paddw mm7, mm5
3071 paddw mm0, mm2
3072 /* test ((pa <= pb)? pa:pb) <= pc */
3073 pcmpgtw mm7, mm6 /* pab > pc? */
3074 pxor mm1, mm1
3075 pand mm3, mm7
3076 pandn mm7, mm0
3077 pxor mm1, mm1 /* (redundant: mm1 was already cleared above) */
3078 paddw mm7, mm3
3079 pxor mm0, mm0
3080 /* Step ebx to next set of 8 bytes and repeat loop til done */
3081 add ebx, 8
3082 packuswb mm1, mm7 /* words -> bytes; predictor ends up in the high 4 bytes */
3083 paddb mm1, [edi + ebx - 8] /* add Paeth predictor with Raw(x) */
3084 cmp ebx, MMXLength
3085 movq [edi + ebx - 8], mm1 /* write back updated value */
3086 /* mm1 will be used as Raw(x-bpp) next loop */
3087 jb dpth8lp
3088 } /* end _asm block */
3089 }
3090 break;
3091
3092 case 1: /* bpp = 1 */
3093 case 2: /* bpp = 2 */
3094 default: /* bpp > 8 */
3095 {
3096 _asm {
3097 mov ebx, diff
3098 cmp ebx, FullLength
3099 jnb dpthdend
3100 mov edi, row
3101 mov esi, prev_row
3102 /* Do Paeth decode for remaining bytes */
3103 mov edx, ebx
3104 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
3105 sub edx, bpp /* Set edx = ebx - bpp */
3106 dpthdlp:
3107 xor eax, eax
3108 /* pav = p - a = (a + b - c) - a = b - c */
3109 mov al, [esi + ebx] /* load Prior(x) into al */
3110 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3111 sub eax, ecx /* subtract Prior(x-bpp) */
3112 mov patemp, eax /* Save pav for later use */
3113 xor eax, eax
3114 /* pbv = p - b = (a + b - c) - b = a - c */
3115 mov al, [edi + edx] /* load Raw(x-bpp) into al */
3116 sub eax, ecx /* subtract Prior(x-bpp) */
3117 mov ecx, eax
3118 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3119 add eax, patemp /* pcv = pav + pbv */
3120 /* pc = abs(pcv) */
3121 test eax, 0x80000000
3122 jz dpthdpca
3123 neg eax /* reverse sign of neg values */
3124 dpthdpca:
3125 mov pctemp, eax /* save pc for later use */
3126 /* pb = abs(pbv) */
3127 test ecx, 0x80000000
3128 jz dpthdpba
3129 neg ecx /* reverse sign of neg values */
3130 dpthdpba:
3131 mov pbtemp, ecx /* save pb for later use */
3132 /* pa = abs(pav) */
3133 mov eax, patemp
3134 test eax, 0x80000000
3135 jz dpthdpaa
3136 neg eax /* reverse sign of neg values */
3137 dpthdpaa:
3138 mov patemp, eax /* save pa for later use */
3139 /* test if pa <= pb */
3140 cmp eax, ecx
3141 jna dpthdabb
3142 /* pa > pb; now test if pb <= pc */
3143 cmp ecx, pctemp
3144 jna dpthdbbc
3145 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3146 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3147 jmp dpthdpaeth
3148 dpthdbbc:
3149 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3150 mov cl, [esi + ebx] /* load Prior(x) into cl */
3151 jmp dpthdpaeth
3152 dpthdabb:
3153 /* pa <= pb; now test if pa <= pc */
3154 cmp eax, pctemp
3155 jna dpthdabc
3156 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3157 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3158 jmp dpthdpaeth
3159 dpthdabc:
3160 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3161 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
3162 dpthdpaeth:
3163 inc ebx
3164 inc edx
3165 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3166 add [edi + ebx - 1], cl
3167 cmp ebx, FullLength
3168 jb dpthdlp
3169 dpthdend:
3170 } /* end _asm block */
3171 }
3172 return; /* Row finished by the scalar loop; this path issued no MMX instructions, so the emms below is skipped */
3173 } /* end switch ( bpp ) */
3174 _asm
3175 {
3176 /* MMX acceleration complete now do clean-up */
3177 /* Check if any remaining bytes left to decode */
3178 mov ebx, MMXLength
3179 cmp ebx, FullLength
3180 jnb dpthend
3181 mov edi, row
3182 mov esi, prev_row
3183 /* Do Paeth decode for remaining bytes */
3184 mov edx, ebx
3185 xor ecx, ecx /* zero ecx before using cl & cx in loop below */
3186 sub edx, bpp /* Set edx = ebx - bpp */
3187 dpthlp2:
3188 xor eax, eax
3189 /* pav = p - a = (a + b - c) - a = b - c */
3190 mov al, [esi + ebx] /* load Prior(x) into al */
3191 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3192 sub eax, ecx /* subtract Prior(x-bpp) */
3193 mov patemp, eax /* Save pav for later use */
3194 xor eax, eax
3195 /* pbv = p - b = (a + b - c) - b = a - c */
3196 mov al, [edi + edx] /* load Raw(x-bpp) into al */
3197 sub eax, ecx /* subtract Prior(x-bpp) */
3198 mov ecx, eax
3199 /* pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv */
3200 add eax, patemp /* pcv = pav + pbv */
3201 /* pc = abs(pcv) */
3202 test eax, 0x80000000
3203 jz dpthpca2
3204 neg eax /* reverse sign of neg values */
3205 dpthpca2:
3206 mov pctemp, eax /* save pc for later use */
3207 /* pb = abs(pbv) */
3208 test ecx, 0x80000000
3209 jz dpthpba2
3210 neg ecx /* reverse sign of neg values */
3211 dpthpba2:
3212 mov pbtemp, ecx /* save pb for later use */
3213 /* pa = abs(pav) */
3214 mov eax, patemp
3215 test eax, 0x80000000
3216 jz dpthpaa2
3217 neg eax /* reverse sign of neg values */
3218 dpthpaa2:
3219 mov patemp, eax /* save pa for later use */
3220 /* test if pa <= pb */
3221 cmp eax, ecx
3222 jna dpthabb2
3223 /* pa > pb; now test if pb <= pc */
3224 cmp ecx, pctemp
3225 jna dpthbbc2
3226 /* pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3227 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3228 jmp dpthpaeth2
3229 dpthbbc2:
3230 /* pb <= pc; Raw(x) = Paeth(x) + Prior(x) */
3231 mov cl, [esi + ebx] /* load Prior(x) into cl */
3232 jmp dpthpaeth2
3233 dpthabb2:
3234 /* pa <= pb; now test if pa <= pc */
3235 cmp eax, pctemp
3236 jna dpthabc2
3237 /* pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp) */
3238 mov cl, [esi + edx] /* load Prior(x-bpp) into cl */
3239 jmp dpthpaeth2
3240 dpthabc2:
3241 /* pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp) */
3242 mov cl, [edi + edx] /* load Raw(x-bpp) into cl */
3243 dpthpaeth2:
3244 inc ebx
3245 inc edx
3246 /* Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256 */
3247 add [edi + ebx - 1], cl
3248 cmp ebx, FullLength
3249 jb dpthlp2
3250 dpthend:
3251 emms /* End MMX instructions; prep for possible FP instrs. */
3252 } /* end _asm block */
3253 }
3254
3255 /* Optimized code for PNG Sub filter decoder */
3256 void /* PRIVATE */
3257 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
3258 {
3259 /*int test; */
3260 int bpp;
3261 png_uint_32 FullLength;
3262 png_uint_32 MMXLength;
3263 int diff;
3264
3265 bpp = (row_info->pixel_depth + 7) >> 3; /* Get # bytes per pixel */
3266 FullLength = row_info->rowbytes - bpp; /* # of bytes to filter */
3267 _asm {
3268 mov edi, row
3269 mov esi, edi /* lp = row */
3270 add edi, bpp /* rp = row + bpp */
3271 xor eax, eax
3272 /* get # of bytes to alignment */
3273 mov diff, edi /* take start of row */
3274 add diff, 0xf /* add 7 + 8 to incr past */
3275 /* alignment boundary */
3276 xor ebx, ebx
3277 and diff, 0xfffffff8 /* mask to alignment boundary */
3278 sub diff, edi /* subtract from start ==> value */
3279 /* ebx at alignment */
3280 jz dsubgo
3281 /* fix alignment */
3282 dsublp1:
3283 mov al, [esi+ebx]
3284 add [edi+ebx], al
3285 inc ebx
3286 cmp ebx, diff
3287 jb dsublp1
3288 dsubgo:
3289 mov ecx, FullLength
3290 mov edx, ecx
3291 sub edx, ebx /* subtract alignment fix */
3292 and edx, 0x00000007 /* calc bytes over mult of 8 */
3293 sub ecx, edx /* drop over bytes from length */
3294 mov MMXLength, ecx
3295 } /* end _asm block */
3296
3297 /* Now do the math for the rest of the row */
3298 switch ( bpp )
3299 {
3300 case 3:
3301 {
3302 ActiveMask.use = 0x0000ffffff000000;
3303 ShiftBpp.use = 24; /* == 3 * 8 */
3304 ShiftRem.use = 40; /* == 64 - 24 */
3305 _asm {
3306 mov edi, row
3307 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */
3308 mov esi, edi /* lp = row */
3309 add edi, bpp /* rp = row + bpp */
3310 movq mm6, mm7
3311 mov ebx, diff
3312 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */
3313 /* byte group */
3314 /* PRIME the pump (load the first Raw(x-bpp) data set */
3315 movq mm1, [edi+ebx-8]
3316 dsub3lp:
3317 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3318 /* no need for mask; shift clears inactive bytes */
3319 /* Add 1st active group */
3320 movq mm0, [edi+ebx]
3321 paddb mm0, mm1
3322 /* Add 2nd active group */
3323 movq mm1, mm0 /* mov updated Raws to mm1 */
3324 psllq mm1, ShiftBpp /* shift data to position correctly */
3325 pand mm1, mm7 /* mask to use only 2nd active group */
3326 paddb mm0, mm1
3327 /* Add 3rd active group */
3328 movq mm1, mm0 /* mov updated Raws to mm1 */
3329 psllq mm1, ShiftBpp /* shift data to position correctly */
3330 pand mm1, mm6 /* mask to use only 3rd active group */
3331 add ebx, 8
3332 paddb mm0, mm1
3333 cmp ebx, MMXLength
3334 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */
3335 /* Prep for doing 1st add at top of loop */
3336 movq mm1, mm0
3337 jb dsub3lp
3338 } /* end _asm block */
3339 }
3340 break;
3341
3342 case 1:
3343 {
3344 /* Placed here just in case this is a duplicate of the */
3345 /* non-MMX code for the SUB filter in png_read_filter_row below */
3346 //
3347 /* png_bytep rp; */
3348 /* png_bytep lp; */
3349 /* png_uint_32 i; */
3350 /* bpp = (row_info->pixel_depth + 7) >> 3; */
3351 /* for (i = (png_uint_32)bpp, rp = row + bpp, lp = row; */
3352 /* i < row_info->rowbytes; i++, rp++, lp++) */
3353 /* { */
3354 /* *rp = (png_byte)(((int)(*rp) + (int)(*lp)) & 0xff); */
3355 /* } */
3356 _asm {
3357 mov ebx, diff
3358 mov edi, row
3359 cmp ebx, FullLength
3360 jnb dsub1end
3361 mov esi, edi /* lp = row */
3362 xor eax, eax
3363 add edi, bpp /* rp = row + bpp */
3364 dsub1lp:
3365 mov al, [esi+ebx]
3366 add [edi+ebx], al
3367 inc ebx
3368 cmp ebx, FullLength
3369 jb dsub1lp
3370 dsub1end:
3371 } /* end _asm block */
3372 }
3373 return;
3374
3375 case 6:
3376 case 7:
3377 case 4:
3378 case 5:
3379 {
3380 ShiftBpp.use = bpp << 3;
3381 ShiftRem.use = 64 - ShiftBpp.use;
3382 _asm {
3383 mov edi, row
3384 mov ebx, diff
3385 mov esi, edi /* lp = row */
3386 add edi, bpp /* rp = row + bpp */
3387 /* PRIME the pump (load the first Raw(x-bpp) data set */
3388 movq mm1, [edi+ebx-8]
3389 dsub4lp:
3390 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3391 /* no need for mask; shift clears inactive bytes */
3392 movq mm0, [edi+ebx]
3393 paddb mm0, mm1
3394 /* Add 2nd active group */
3395 movq mm1, mm0 /* mov updated Raws to mm1 */
3396 psllq mm1, ShiftBpp /* shift data to position correctly */
3397 /* there is no need for any mask */
3398 /* since shift clears inactive bits/bytes */
3399 add ebx, 8
3400 paddb mm0, mm1
3401 cmp ebx, MMXLength
3402 movq [edi+ebx-8], mm0
3403 movq mm1, mm0 /* Prep for doing 1st add at top of loop */
3404 jb dsub4lp
3405 } /* end _asm block */
3406 }
3407 break;
3408
3409 case 2:
3410 {
3411 ActiveMask.use = 0x00000000ffff0000;
3412 ShiftBpp.use = 16; /* == 2 * 8 */
3413 ShiftRem.use = 48; /* == 64 - 16 */
3414 _asm {
3415 movq mm7, ActiveMask /* Load ActiveMask for 2nd active byte group */
3416 mov ebx, diff
3417 movq mm6, mm7
3418 mov edi, row
3419 psllq mm6, ShiftBpp /* Move mask in mm6 to cover 3rd active */
3420 /* byte group */
3421 mov esi, edi /* lp = row */
3422 movq mm5, mm6
3423 add edi, bpp /* rp = row + bpp */
3424 psllq mm5, ShiftBpp /* Move mask in mm5 to cover 4th active */
3425 /* byte group */
3426 /* PRIME the pump (load the first Raw(x-bpp) data set */
3427 movq mm1, [edi+ebx-8]
3428 dsub2lp:
3429 /* Add 1st active group */
3430 psrlq mm1, ShiftRem /* Shift data for adding 1st bpp bytes */
3431 /* no need for mask; shift clears inactive */
3432 /* bytes */
3433 movq mm0, [edi+ebx]
3434 paddb mm0, mm1
3435 /* Add 2nd active group */
3436 movq mm1, mm0 /* mov updated Raws to mm1 */
3437 psllq mm1, ShiftBpp /* shift data to position correctly */
3438 pand mm1, mm7 /* mask to use only 2nd active group */
3439 paddb mm0, mm1
3440 /* Add 3rd active group */
3441 movq mm1, mm0 /* mov updated Raws to mm1 */
3442 psllq mm1, ShiftBpp /* shift data to position correctly */
3443 pand mm1, mm6 /* mask to use only 3rd active group */
3444 paddb mm0, mm1
3445 /* Add 4th active group */
3446 movq mm1, mm0 /* mov updated Raws to mm1 */
3447 psllq mm1, ShiftBpp /* shift data to position correctly */
3448 pand mm1, mm5 /* mask to use only 4th active group */
3449 add ebx, 8
3450 paddb mm0, mm1
3451 cmp ebx, MMXLength
3452 movq [edi+ebx-8], mm0 /* Write updated Raws back to array */
3453 movq mm1, mm0 /* Prep for doing 1st add at top of loop */
3454 jb dsub2lp
3455 } /* end _asm block */
3456 }
3457 break;
3458 case 8:
3459 {
3460 _asm {
3461 mov edi, row
3462 mov ebx, diff
3463 mov esi, edi /* lp = row */
3464 add edi, bpp /* rp = row + bpp */
3465 mov ecx, MMXLength
3466 movq mm7, [edi+ebx-8] /* PRIME the pump (load the first */
3467 /* Raw(x-bpp) data set */
3468 and ecx, 0x0000003f /* calc bytes over mult of 64 */
3469 dsub8lp:
3470 movq mm0, [edi+ebx] /* Load Sub(x) for 1st 8 bytes */
3471 paddb mm0, mm7
3472 movq mm1, [edi+ebx+8] /* Load Sub(x) for 2nd 8 bytes */
3473 movq [edi+ebx], mm0 /* Write Raw(x) for 1st 8 bytes */
3474 /* Now mm0 will be used as Raw(x-bpp) for */
3475 /* the 2nd group of 8 bytes. This will be */
3476 /* repeated for each group of 8 bytes with */
3477 /* the 8th group being used as the Raw(x-bpp) */
3478 /* for the 1st group of the next loop. */
3479 paddb mm1, mm0
3480 movq mm2, [edi+ebx+16] /* Load Sub(x) for 3rd 8 bytes */
3481 movq [edi+ebx+8], mm1 /* Write Raw(x) for 2nd 8 bytes */
3482 paddb mm2, mm1
3483 movq mm3, [edi+ebx+24] /* Load Sub(x) for 4th 8 bytes */
3484 movq [edi+ebx+16], mm2 /* Write Raw(x) for 3rd 8 bytes */
3485 paddb mm3, mm2
3486 movq mm4, [edi+ebx+32] /* Load Sub(x) for 5th 8 bytes */
3487 movq [edi+ebx+24], mm3 /* Write Raw(x) for 4th 8 bytes */
3488 paddb mm4, mm3
3489 movq mm5, [edi+ebx+40] /* Load Sub(x) for 6th 8 bytes */
3490 movq [edi+ebx+32], mm4 /* Write Raw(x) for 5th 8 bytes */
3491 paddb mm5, mm4
3492 movq mm6, [edi+ebx+48] /* Load Sub(x) for 7th 8 bytes */
3493 movq [edi+ebx+40], mm5 /* Write Raw(x) for 6th 8 bytes */
3494 paddb mm6, mm5
3495 movq mm7, [edi+ebx+56] /* Load Sub(x) for 8th 8 bytes */
3496 movq [edi+ebx+48], mm6 /* Write Raw(x) for 7th 8 bytes */
3497 add ebx, 64
3498 paddb mm7, mm6
3499 cmp ebx, ecx
3500 movq [edi+ebx-8], mm7 /* Write Raw(x) for 8th 8 bytes */
3501 jb dsub8lp
3502 cmp ebx, MMXLength
3503 jnb dsub8lt8
3504 dsub8lpA:
3505 movq mm0, [edi+ebx]
3506 add ebx, 8
3507 paddb mm0, mm7
3508 cmp ebx, MMXLength
3509 movq [edi+ebx-8], mm0 /* use -8 to offset early add to ebx */
3510 movq mm7, mm0 /* Move calculated Raw(x) data to mm1 to */
3511 /* be the new Raw(x-bpp) for the next loop */
3512 jb dsub8lpA
3513 dsub8lt8:
3514 } /* end _asm block */
3515 }
3516 break;
3517
3518 default: /* bpp greater than 8 bytes */
3519 {
3520 _asm {
3521 mov ebx, diff
3522 mov edi, row
3523 mov esi, edi /* lp = row */
3524 add edi, bpp /* rp = row + bpp */
3525 dsubAlp:
3526 movq mm0, [edi+ebx]
3527 movq mm1, [esi+ebx]
3528 add ebx, 8
3529 paddb mm0, mm1
3530 cmp ebx, MMXLength
3531 movq [edi+ebx-8], mm0 /* mov does not affect flags; -8 to offset */
3532 /* add ebx */
3533 jb dsubAlp
3534 } /* end _asm block */
3535 }
3536 break;
3537
3538 } /* end switch ( bpp ) */
3539
3540 _asm {
3541 mov ebx, MMXLength
3542 mov edi, row
3543 cmp ebx, FullLength
3544 jnb dsubend
3545 mov esi, edi /* lp = row */
3546 xor eax, eax
3547 add edi, bpp /* rp = row + bpp */
3548 dsublp2:
3549 mov al, [esi+ebx]
3550 add [edi+ebx], al
3551 inc ebx
3552 cmp ebx, FullLength
3553 jb dsublp2
3554 dsubend:
3555 emms /* End MMX instructions; prep for possible FP instrs. */
3556 } /* end _asm block */
3557 }
3558
3559 /* Optimized code for PNG Up filter decoder */
/*
 * Adds prev_row to row in place, one byte at a time (8-bit adds / paddb, so
 * each sum wraps within its own byte), over row_info->rowbytes bytes.
 *
 *   row_info - only row_info->rowbytes is read here
 *   row      - bytes to update; overwritten with row[i] + prev_row[i]
 *   prev_row - bytes that are added; only read
 *
 * The work is split into four phases:
 *   1. duplp1  - single bytes until the address of row+offset is a multiple
 *                of 8
 *   2. duploop - 64 bytes per iteration using mm0..mm7
 *   3. duplpA  - 8 bytes per iteration using mm0/mm1
 *   4. duplp2  - single bytes for whatever is left
 *
 * Register use inside the _asm block:
 *   edi = row, esi = prev_row, ebx = running byte offset into both rows,
 *   ecx = offset at which the current phase stops, edx = leftover byte count,
 *   al  = scratch for the byte-wise loops.
 *
 * NOTE(review): duplp1 and duploop test their exit condition only after the
 * first pass and nothing compares the offset with len beforehand, so a row
 * shorter than the alignment prefix or with fewer than 64 bytes after it
 * would still get a full pass.  Presumably the caller's rowbytes threshold
 * and/or padding of the row buffers makes this safe - confirm before calling
 * this routine on short rows.
 */
3560 void /* PRIVATE */
3561 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
3562 png_bytep prev_row)
3563 {
3564 png_uint_32 len;
3565 len = row_info->rowbytes; /* # of bytes to filter */
3566 _asm {
3567 mov edi, row /* edi = base of the row being updated */
3568 /* get # of bytes to alignment */
3569 mov ecx, edi
3570 xor ebx, ebx /* ebx = 0: byte offset shared by row and prev_row */
3571 add ecx, 0x7
3572 xor eax, eax
3573 and ecx, 0xfffffff8 /* ecx = row rounded up to a multiple of 8 */
3574 mov esi, prev_row /* esi = base of the row that is added */
3575 sub ecx, edi /* ecx = 0..7 bytes needed to reach that boundary */
3576 jz dupgo /* already aligned: skip the byte-wise prefix */
3577 /* fix alignment */
3578 duplp1:
3579 mov al, [edi+ebx]
3580 add al, [esi+ebx]
3581 inc ebx
3582 cmp ebx, ecx
3583 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */
3584 jb duplp1
3585 dupgo:
3586 mov ecx, len
3587 mov edx, ecx
3588 sub edx, ebx /* subtract alignment fix */
3589 and edx, 0x0000003f /* calc bytes over mult of 64 */
3590 sub ecx, edx /* drop over bytes from length */
/* Here ecx = offset where the 64-byte passes stop, edx = bytes left after */
3591 /* Unrolled loop - use all MMX registers and interleave to reduce */
3592 /* number of branch instructions (loops) and reduce partial stalls */
3593 duploop:
3594 movq mm1, [esi+ebx]
3595 movq mm0, [edi+ebx]
3596 movq mm3, [esi+ebx+8]
3597 paddb mm0, mm1
3598 movq mm2, [edi+ebx+8]
3599 movq [edi+ebx], mm0
3600 paddb mm2, mm3
3601 movq mm5, [esi+ebx+16]
3602 movq [edi+ebx+8], mm2
3603 movq mm4, [edi+ebx+16]
3604 movq mm7, [esi+ebx+24]
3605 paddb mm4, mm5
3606 movq mm6, [edi+ebx+24]
3607 movq [edi+ebx+16], mm4
3608 paddb mm6, mm7
3609 movq mm1, [esi+ebx+32]
3610 movq [edi+ebx+24], mm6
3611 movq mm0, [edi+ebx+32]
3612 movq mm3, [esi+ebx+40]
3613 paddb mm0, mm1
3614 movq mm2, [edi+ebx+40]
3615 movq [edi+ebx+32], mm0
3616 paddb mm2, mm3
3617 movq mm5, [esi+ebx+48]
3618 movq [edi+ebx+40], mm2
3619 movq mm4, [edi+ebx+48]
3620 movq mm7, [esi+ebx+56]
3621 paddb mm4, mm5
3622 movq mm6, [edi+ebx+56]
3623 movq [edi+ebx+48], mm4
3624 add ebx, 64
3625 paddb mm6, mm7
3626 cmp ebx, ecx
3627 movq [edi+ebx-8], mm6 /* (+56)movq does not affect flags; */
3628 /* -8 to offset add ebx */
3629 jb duploop
3630
3631 cmp edx, 0 /* Test for bytes over mult of 64 */
3632 jz dupend
3633
3634
3635 /* 2 lines added by lcreeve@netins.net */
3636 /* (mail 11 Jul 98 in png-implement list) */
3637 cmp edx, 8 /*test for less than 8 bytes */
3638 jb duplt8
3639
3640
3641 add ecx, edx /* ecx is back to the full length (len) */
3642 and edx, 0x00000007 /* calc bytes over mult of 8 */
3643 sub ecx, edx /* drop over bytes from length */
3644 jz duplt8 /* taken only if the subtraction left ecx at zero */
3645 /* Loop using MMX registers mm0 & mm1 to update 8 bytes simultaneously */
3646 duplpA:
3647 movq mm1, [esi+ebx]
3648 movq mm0, [edi+ebx]
3649 add ebx, 8
3650 paddb mm0, mm1
3651 cmp ebx, ecx
3652 movq [edi+ebx-8], mm0 /* movq does not affect flags; -8 to offset add ebx */
3653 jb duplpA
3654 cmp edx, 0 /* Test for bytes over mult of 8 */
3655 jz dupend
3656 duplt8:
3657 xor eax, eax
3658 add ecx, edx /* move over byte count into counter */
3659 /* Loop using x86 registers to update remaining bytes */
3660 duplp2:
3661 mov al, [edi + ebx]
3662 add al, [esi + ebx]
3663 inc ebx
3664 cmp ebx, ecx
3665 mov [edi + ebx-1], al /* mov does not affect flags; -1 to offset inc ebx */
3666 jb duplp2
3667 dupend:
3668 /* Conversion of filtered row completed */
3669 emms /* End MMX instructions; prep for possible FP instrs. */
3670 } /* end _asm block */
3671 }
3672
3673
3674 /* Optimized png_read_filter_row routines */
3675 void /* PRIVATE */
3676 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
3677 row, png_bytep prev_row, int filter)
3678 {
3679 #ifdef PNG_DEBUG
3680 char filnm[10];
3681 #endif
3682
3683 if (mmx_supported == 2) {
3684 #if !defined(PNG_1_0_X)
3685 /* this should have happened in png_init_mmx_flags() already */
3686 png_warning(png_ptr, "asm_flags may not have been initialized");
3687 #endif
3688 png_mmx_support();
3689 }
3690
3691 #ifdef PNG_DEBUG
3692 png_debug(1, "in png_read_filter_row\n");
3693 switch (filter)
3694 {
3695 case 0: sprintf(filnm, "none");
3696 break;
3697 #if !defined(PNG_1_0_X)
3698 case 1: sprintf(filnm, "sub-%s",
3699 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : "x86");
3700 break;
3701 case 2: sprintf(filnm, "up-%s",
3702 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" : "x86");
3703 break;
3704 case 3: sprintf(filnm, "avg-%s",
3705 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" : "x86");
3706 break;
3707 case 4: sprintf(filnm, "Paeth-%s",
3708 (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":"x86");
3709 break;
3710 #else
3711 case 1: sprintf(filnm, "sub");
3712 break;
3713 case 2: sprintf(filnm, "up");
3714 break;
3715 case 3: sprintf(filnm, "avg");
3716 break;
3717 case 4: sprintf(filnm, "Paeth");
3718 break;
3719 #endif
3720 default: sprintf(filnm, "unknw");
3721 break;
3722 }
3723 png_debug2(0,"row=%5d, %s, ", png_ptr->row_number, filnm);
3724 png_debug2(0, "pd=%2d, b=%d, ", (int)row_info->pixel_depth,
3725 (int)((row_info->pixel_depth + 7) >> 3));
3726 png_debug1(0,"len=%8d, ", row_info->rowbytes);
3727 #endif /* PNG_DEBUG */
3728
3729 switch (filter)
3730 {
3731 case PNG_FILTER_VALUE_NONE:
3732 break;
3733
3734 case PNG_FILTER_VALUE_SUB:
3735 {
3736 #if !defined(PNG_1_0_X)
3737 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
3738 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3739 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3740 #else
3741 if (mmx_supported)
3742 #endif
3743 {
3744 png_read_filter_row_mmx_sub(row_info, row);
3745 }
3746 else
3747 {
3748 png_uint_32 i;
3749 png_uint_32 istop = row_info->rowbytes;
3750 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3751 png_bytep rp = row + bpp;
3752 png_bytep lp = row;
3753
3754 for (i = bpp; i < istop; i++)
3755 {
3756 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
3757 rp++;
3758 }
3759 }
3760 break;
3761 }
3762
3763 case PNG_FILTER_VALUE_UP:
3764 {
3765 #if !defined(PNG_1_0_X)
3766 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
3767 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3768 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3769 #else
3770 if (mmx_supported)
3771 #endif
3772 {
3773 png_read_filter_row_mmx_up(row_info, row, prev_row);
3774 }
3775 else
3776 {
3777 png_uint_32 i;
3778 png_uint_32 istop = row_info->rowbytes;
3779 png_bytep rp = row;
3780 png_bytep pp = prev_row;
3781
3782 for (i = 0; i < istop; ++i)
3783 {
3784 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3785 rp++;
3786 }
3787 }
3788 break;
3789 }
3790
3791 case PNG_FILTER_VALUE_AVG:
3792 {
3793 #if !defined(PNG_1_0_X)
3794 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
3795 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3796 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3797 #else
3798 if (mmx_supported)
3799 #endif
3800 {
3801 png_read_filter_row_mmx_avg(row_info, row, prev_row);
3802 }
3803 else
3804 {
3805 png_uint_32 i;
3806 png_bytep rp = row;
3807 png_bytep pp = prev_row;
3808 png_bytep lp = row;
3809 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3810 png_uint_32 istop = row_info->rowbytes - bpp;
3811
3812 for (i = 0; i < bpp; i++)
3813 {
3814 *rp = (png_byte)(((int)(*rp) +
3815 ((int)(*pp++) >> 1)) & 0xff);
3816 rp++;
3817 }
3818
3819 for (i = 0; i < istop; i++)
3820 {
3821 *rp = (png_byte)(((int)(*rp) +
3822 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
3823 rp++;
3824 }
3825 }
3826 break;
3827 }
3828
3829 case PNG_FILTER_VALUE_PAETH:
3830 {
3831 #if !defined(PNG_1_0_X)
3832 if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
3833 (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
3834 (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
3835 #else
3836 if (mmx_supported)
3837 #endif
3838 {
3839 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
3840 }
3841 else
3842 {
3843 png_uint_32 i;
3844 png_bytep rp = row;
3845 png_bytep pp = prev_row;
3846 png_bytep lp = row;
3847 png_bytep cp = prev_row;
3848 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
3849 png_uint_32 istop=row_info->rowbytes - bpp;
3850
3851 for (i = 0; i < bpp; i++)
3852 {
3853 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
3854 rp++;
3855 }
3856
3857 for (i = 0; i < istop; i++) /* use leftover rp,pp */
3858 {
3859 int a, b, c, pa, pb, pc, p;
3860
3861 a = *lp++;
3862 b = *pp++;
3863 c = *cp++;
3864
3865 p = b - c;
3866 pc = a - c;
3867
3868 #ifdef PNG_USE_ABS
3869 pa = abs(p);
3870 pb = abs(pc);
3871 pc = abs(p + pc);
3872 #else
3873 pa = p < 0 ? -p : p;
3874 pb = pc < 0 ? -pc : pc;
3875 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
3876 #endif
3877
3878 /*
3879 if (pa <= pb && pa <= pc)
3880 p = a;
3881 else if (pb <= pc)
3882 p = b;
3883 else
3884 p = c;
3885 */
3886
3887 p = (pa <= pb && pa <=pc) ? a : (pb <= pc) ? b : c;
3888
3889 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
3890 rp++;
3891 }
3892 }
3893 break;
3894 }
3895
3896 default:
3897 png_warning(png_ptr, "Ignoring bad row filter type");
3898 *row=0;
3899 break;
3900 }
3901 }
3902
3903 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGVCRD */