Remove obsolete VisualAge-related files.
[wxWidgets.git] / src / png / arm / filter_neon.S
1
2 /* filter_neon.S - NEON optimised filter functions
3  *
4  * Copyright (c) 2011 Glenn Randers-Pehrson
5  * Written by Mans Rullgard, 2011.
6  * Last changed in libpng 1.5.7 [December 15, 2011]
7  *
8  * This code is released under the libpng license.
9  * For conditions of distribution and use, see the disclaimer
10  * and license in png.h
11  */
12
13 /* This is required to get the symbol renames, which are #defines, and also
14  * includes the value of PNG_FILTER_OPTIMIZATIONS.
15  */
16 #define PNG_VERSION_INFO_ONLY
17 #include "../pngpriv.h"
18
19 #if defined(PNG_FILTER_OPTIMIZATIONS) && defined(__arm__) && \
20    defined(__ARM_NEON__)
21 #if defined(__linux__) && defined(__ELF__)
22 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
23 #endif
24
/* ELF is written in front of directives that are only wanted on ELF targets
 * (.size and .type in the func macro).  When __ELF__ is defined it expands to
 * nothing and the directive is assembled; otherwise it expands to @, the
 * comment character used throughout this file, so the rest of that line is
 * ignored by the assembler.
 */
25 #ifdef __ELF__
26 #   define ELF
27 #else
28 #   define ELF @
29 #endif
30
/* Select the ARMv7-A architecture and the NEON unit for everything assembled
 * below.
 */
31         .arch armv7-a
32         .fpu  neon
33
/* func name, export=0
 *
 * Opens a function in the .text section and emits its entry label.
 *
 *   name    symbol to define; also used for the .type, .size and .func
 *           bookkeeping
 *   export  non-zero makes the symbol global; the default keeps it local
 *
 * Each use also defines a one-shot endfunc macro that closes the function:
 * it emits the ELF symbol size (distance from the entry label to the current
 * location), ends the .func region, and then purges itself so that the next
 * func can define it again.  Every func must therefore be paired with exactly
 * one endfunc.
 */
34 .macro  func    name, export=0
35     .macro endfunc
36 ELF     .size   \name, . - \name
37         .endfunc
38         .purgem endfunc
39     .endm
40         .text
        /* Do not rely on the location counter already being word aligned
         * when .text is entered: force a 4-byte boundary so the entry point
         * is always valid.  This is a no-op when it is already aligned.
         */
        .p2align 2
41     .if \export
42         .global \name
43     .endif
44 ELF     .type   \name, STT_FUNC
45         .func   \name
46 \name:
47 .endm
48
/* png_read_filter_row_sub4_neon
 *
 * Reconstructs a Sub-filtered row of 4-byte pixels in place: every output
 * pixel is the stored pixel plus the already reconstructed pixel to its
 * left, added per byte with wrap-around.
 *
 * In:     r0 = pointer to a structure whose word at offset 4 is the row
 *              length in bytes
 *         r1 = row data, rewritten in place; accessed with the :128
 *              qualifier, i.e. the address is taken to be 16-byte aligned
 * Writes: r1, r3, d0-d7, flags
 *
 * NOTE(review): presumably called with the same arguments as the C row
 * filters (row_info, row, prev_row) - confirm against the prototype.
 *
 * One pass handles 16 bytes (four pixels) and the loop is tested at the
 * bottom, so at least one pass always runs and a length that is not a
 * multiple of 16 is effectively rounded up to the next multiple.
 */
49 func    png_read_filter_row_sub4_neon, export=1
50         ldr             r3,  [r0, #4]           @ rowbytes
        /* d3 carries the last reconstructed pixel into the next pass.  The
         * first pixel has no left neighbour, so it starts at zero.
         */
51         vmov.i8         d3,  #0
52 1:
        /* Fetch four 32-bit pixels, one per register (copied to every lane);
         * r1 is not advanced here.
         */
53         vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128]
        /* Running sum: each result is the left neighbour of the next one. */
54         vadd.u8         d0,  d3,  d4
55         vadd.u8         d1,  d0,  d5
56         vadd.u8         d2,  d1,  d6
57         vadd.u8         d3,  d2,  d7
        /* Write lane 0 of each result over the input and step r1 by 16. */
58         vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
59         subs            r3,  r3,  #16
60         bgt             1b
61
62         bx              lr
63 endfunc
64
/* png_read_filter_row_sub3_neon
 *
 * Reconstructs a Sub-filtered row of 3-byte pixels in place: every output
 * pixel is the stored pixel plus the already reconstructed pixel to its
 * left, added per byte with wrap-around.
 *
 * In:     r0 = pointer to a structure whose word at offset 4 is the row
 *              length in bytes
 *         r1 = row data, rewritten in place
 * Writes: r0, r1, r2, r3, r12, d0-d3, d5-d7, q11, flags
 *
 * One pass handles 12 bytes (four pixels).  Data is read 16 bytes at a time
 * with a stride of 12 and written 4 bytes at a time with a stride of 3, so:
 *   - every load also picks up the first 4 bytes of the following group;
 *   - the load inside the final pass reads the 16 bytes that follow the
 *     last processed group;
 *   - every store writes one byte beyond its pixel, the last one landing
 *     one byte past the processed data.
 * The row buffer has to tolerate those accesses.  The loop is tested at the
 * bottom, so at least one pass always runs.
 */
65 func    png_read_filter_row_sub3_neon, export=1
66         ldr             r3,  [r0, #4]           @ rowbytes
        /* d3 = reconstructed pixel to the left; zero before the first pixel. */
67         vmov.i8         d3,  #0
        /* r0 becomes the read pointer and runs one group ahead of the write
         * pointer r1.  r2 is the store stride, r12 the load stride.
         */
68         mov             r0,  r1
69         mov             r2,  #3
70         mov             r12, #12
71         vld1.8          {q11},    [r0], r12
72 1:
        /* q11 = d22:d23 holds the current group.  Pixel 0 already sits in
         * the low bytes of d22; the vext instructions move pixels 1, 2 and 3
         * (byte offsets 3, 6 and 9) down to the low bytes of d5, d6 and d7.
         */
73         vext.8          d5,  d22, d23, #3
74         vadd.u8         d0,  d3,  d22
75         vext.8          d6,  d22, d23, #6
76         vadd.u8         d1,  d0,  d5
77         vext.8          d7,  d23, d23, #1
        /* Fetch the next group now that this one has been fully unpacked.
         * This must precede the fourth store below, which overwrites the
         * first byte of the next group in memory.
         */
78         vld1.8          {q11},    [r0], r12
        /* Only the first store of a group can carry the :32 qualifier; the
         * other three sit at offsets 3, 6 and 9 from it.
         */
79         vst1.32         {d0[0]},  [r1,:32], r2
80         vadd.u8         d2,  d1,  d6
81         vst1.32         {d1[0]},  [r1], r2
82         vadd.u8         d3,  d2,  d7
83         vst1.32         {d2[0]},  [r1], r2
84         vst1.32         {d3[0]},  [r1], r2
85         subs            r3,  r3,  #12
86         bgt             1b
87
88         bx              lr
89 endfunc
90
/* png_read_filter_row_up_neon
 *
 * Adds a second buffer to the row, byte by byte with wrap-around, and stores
 * the result back into the row.
 *
 * In:     r0 = pointer to a structure whose word at offset 4 is the row
 *              length in bytes
 *         r1 = row data, rewritten in place
 *         r2 = buffer added to the row (presumably the previous,
 *              already reconstructed row - confirm against the caller)
 *         Both buffers are accessed with the :128 qualifier, i.e. their
 *         addresses are taken to be 16-byte aligned.
 * Writes: r1, r2, r3, q0, q1, flags
 *
 * One pass handles 16 bytes and the loop is tested at the bottom, so at
 * least one pass always runs and a length that is not a multiple of 16 is
 * effectively rounded up to the next multiple.
 */
91 func    png_read_filter_row_up_neon, export=1
92         ldr             r3,  [r0, #4]           @ rowbytes
93 1:
        /* r1 is left unchanged by the load; the store below advances it. */
94         vld1.8          {q0}, [r1,:128]
95         vld1.8          {q1}, [r2,:128]!
96         vadd.u8         q0,  q0,  q1
97         vst1.8          {q0}, [r1,:128]!
98         subs            r3,  r3,  #16
99         bgt             1b
100
101         bx              lr
102 endfunc
103
/* png_read_filter_row_avg4_neon
 *
 * Reconstructs an Average-filtered row of 4-byte pixels in place: every
 * output pixel is the stored pixel plus the per-byte average, rounded down,
 * of the reconstructed pixel to its left and the pixel at the same position
 * in a second buffer.
 *
 * In:     r0 = pointer to a structure whose word at offset 4 is the row
 *              length in bytes
 *         r1 = row data, rewritten in place
 *         r2 = second buffer (presumably the previous row - confirm against
 *              the caller)
 *         Both buffers are accessed with the :128 qualifier, i.e. their
 *         addresses are taken to be 16-byte aligned.
 * Writes: r1, r2, r12, d0-d7, d16-d19, flags
 *
 * One pass handles 16 bytes (four pixels) and the loop is tested at the
 * bottom, so at least one pass always runs and a length that is not a
 * multiple of 16 is effectively rounded up to the next multiple.
 */
104 func    png_read_filter_row_avg4_neon, export=1
105         ldr             r12, [r0, #4]           @ rowbytes
        /* d3 = reconstructed pixel to the left; zero before the first pixel. */
106         vmov.i8         d3,  #0
107 1:
        /* d4-d7 = four pixels of the row, d16-d19 = the four pixels at the
         * same positions in the second buffer.
         */
108         vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128]
109         vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]!
        /* For each pixel: the halving add yields (left + other) >> 1 without
         * overflowing a byte, then the stored value is added.  Each result
         * is the left neighbour of the next pixel.
         */
110         vhadd.u8        d0,  d3,  d16
111         vadd.u8         d0,  d0,  d4
112         vhadd.u8        d1,  d0,  d17
113         vadd.u8         d1,  d1,  d5
114         vhadd.u8        d2,  d1,  d18
115         vadd.u8         d2,  d2,  d6
116         vhadd.u8        d3,  d2,  d19
117         vadd.u8         d3,  d3,  d7
118         vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
119         subs            r12, r12, #16
120         bgt             1b
121
122         bx              lr
123 endfunc
124
/* png_read_filter_row_avg3_neon
 *
 * Reconstructs an Average-filtered row of 3-byte pixels in place: every
 * output pixel is the stored pixel plus the per-byte average, rounded down,
 * of the reconstructed pixel to its left and the pixel at the same position
 * in a second buffer.
 *
 * In:     r0 = pointer to a structure whose word at offset 4 is the row
 *              length in bytes
 *         r1 = row data, rewritten in place
 *         r2 = second buffer (presumably the previous row - confirm against
 *              the caller)
 * Writes: r0, r1, r2, r12, d0-d3, d5-d7, d17-d19, q10, q11, flags
 *         r4 and lr are used as scratch and restored on return.
 *
 * One pass handles 12 bytes (four pixels).  Both buffers are read 16 bytes
 * at a time with a stride of 12, and the row is written 4 bytes at a time
 * with a stride of 3.  Reads therefore extend past the processed data (the
 * row load inside the final pass covers the 16 bytes after the last group)
 * and the last store writes one byte past it; the buffers have to tolerate
 * that.  The loop is tested at the bottom, so at least one pass always runs.
 */
125 func    png_read_filter_row_avg3_neon, export=1
126         push            {r4,lr}
127         ldr             r12, [r0, #4]           @ rowbytes
        /* d3 = reconstructed pixel to the left; zero before the first pixel. */
128         vmov.i8         d3,  #0
        /* r0 becomes the row read pointer and runs one group ahead of the
         * write pointer r1.  r4 is the store stride, lr the load stride.
         */
129         mov             r0,  r1
130         mov             r4,  #3
131         mov             lr,  #12
132         vld1.8          {q11},    [r0], lr
133 1:
        /* q11 = d22:d23 holds the current row group, q10 = d20:d21 the
         * matching group of the second buffer.  Pixel 0 of each is already
         * in the low bytes; the vext instructions bring pixels 1, 2 and 3
         * (byte offsets 3, 6 and 9) down into d5/d6/d7 for the row and
         * d17/d18/d19 for the second buffer.
         */
134         vld1.8          {q10},    [r2], lr
135         vext.8          d5,  d22, d23, #3
136         vhadd.u8        d0,  d3,  d20
137         vext.8          d17, d20, d21, #3
138         vadd.u8         d0,  d0,  d22
139         vext.8          d6,  d22, d23, #6
140         vhadd.u8        d1,  d0,  d17
141         vext.8          d18, d20, d21, #6
142         vadd.u8         d1,  d1,  d5
143         vext.8          d7,  d23, d23, #1
        /* Fetch the next row group now that d5-d7 have been extracted.  This
         * must precede the fourth store below, which overwrites the first
         * byte of the next group in memory.
         */
144         vld1.8          {q11},    [r0], lr
145         vst1.32         {d0[0]},  [r1,:32], r4
146         vhadd.u8        d2,  d1,  d18
147         vst1.32         {d1[0]},  [r1], r4
148         vext.8          d19, d21, d21, #1
149         vadd.u8         d2,  d2,  d6
150         vhadd.u8        d3,  d2,  d19
151         vst1.32         {d2[0]},  [r1], r4
152         vadd.u8         d3,  d3,  d7
153         vst1.32         {d3[0]},  [r1], r4
154         subs            r12, r12, #12
155         bgt             1b
156
        /* Restores r4 and returns by loading the saved lr into pc. */
157         pop             {r4,pc}
158 endfunc
159
/* paeth rx, ra, rb, rc
 *
 * Per byte, selects one of ra, rb or rc and writes it to rx:
 *   ra  if pa <= pb and pa <= pc
 *   rb  otherwise, if pb <= pc
 *   rc  otherwise
 * where pa = |rb - rc|, pb = |ra - rc| and pc = |ra + rb - 2*rc|.  The
 * sums are formed in 16-bit lanes so they cannot overflow.
 *
 *   rx          destination D register
 *   ra, rb, rc  source D registers, left unmodified unless one of them is
 *               also given as rx
 *
 * Clobbers q12-q15; none of the operands may live there.
 */
160 .macro  paeth           rx,  ra,  rb,  rc
161         vaddl.u8        q12, \ra, \rb           @ a + b
162         vaddl.u8        q15, \rc, \rc           @ 2*c
163         vabdl.u8        q13, \rb, \rc           @ pa
164         vabdl.u8        q14, \ra, \rc           @ pb
165         vabd.u16        q15, q12, q15           @ pc
166         vcle.u16        q12, q13, q14           @ pa <= pb
167         vcle.u16        q13, q13, q15           @ pa <= pc
168         vcle.u16        q14, q14, q15           @ pb <= pc
169         vand            q12, q12, q13           @ pa <= pb && pa <= pc
        /* Narrow the 16-bit masks to bytes.  d28 is the low half of q14, so
         * the pb <= pc mask is narrowed onto itself.
         */
170         vmovn.u16       d28, q14
171         vmovn.u16       \rx, q12
        /* d28 = rb where pb <= pc, else rc; rx = ra where its mask is set,
         * else that intermediate choice.
         */
172         vbsl            d28, \rb, \rc
173         vbsl            \rx, \ra, d28
174 .endm
175
/* png_read_filter_row_paeth4_neon
 *
 * Reconstructs a Paeth-filtered row of 4-byte pixels in place: every output
 * pixel is the stored pixel plus the value the paeth macro selects from the
 * reconstructed pixel to its left, the pixel at the same position in a
 * second buffer, and that buffer's pixel one position to the left.
 *
 * In:     r0 = pointer to a structure whose word at offset 4 is the row
 *              length in bytes
 *         r1 = row data, rewritten in place
 *         r2 = second buffer (presumably the previous row - confirm against
 *              the caller)
 *         Both buffers are accessed with the :128 qualifier, i.e. their
 *         addresses are taken to be 16-byte aligned.
 * Writes: r1, r2, r12, d0-d7, d16-d20, q12-q15, flags
 *
 * One pass handles 16 bytes (four pixels) and the loop is tested at the
 * bottom, so at least one pass always runs and a length that is not a
 * multiple of 16 is effectively rounded up to the next multiple.
 */
176 func    png_read_filter_row_paeth4_neon, export=1
177         ldr             r12, [r0, #4]           @ rowbytes
        /* Values carried between passes, both zero before the first pixel:
         * d3 = reconstructed pixel to the left, d20 = last pixel of the
         * second buffer consumed so far.
         */
178         vmov.i8         d3,  #0
179         vmov.i8         d20, #0
180 1:
        /* d4-d7 = four pixels of the row, d16-d19 = the four pixels at the
         * same positions in the second buffer.
         */
181         vld4.32         {d4[],d5[],d6[],d7[]},    [r1,:128]
182         vld4.32         {d16[],d17[],d18[],d19[]},[r2,:128]!
        /* Macro operands are: result, left, same position, one to the left
         * in the second buffer.
         */
183         paeth           d0,  d3,  d16, d20
184         vadd.u8         d0,  d0,  d4
185         paeth           d1,  d0,  d17, d16
186         vadd.u8         d1,  d1,  d5
187         paeth           d2,  d1,  d18, d17
188         vadd.u8         d2,  d2,  d6
189         paeth           d3,  d2,  d19, d18
        /* Keep the last second-buffer pixel for the first pixel of the next
         * pass.
         */
190         vmov            d20, d19
191         vadd.u8         d3,  d3,  d7
192         vst4.32         {d0[0],d1[0],d2[0],d3[0]},[r1,:128]!
193         subs            r12, r12, #16
194         bgt             1b
195
196         bx              lr
197 endfunc
198
/* png_read_filter_row_paeth3_neon
 *
 * Reconstructs a Paeth-filtered row of 3-byte pixels in place: every output
 * pixel is the stored pixel plus the value the paeth macro selects from the
 * reconstructed pixel to its left, the pixel at the same position in a
 * second buffer, and that buffer's pixel one position to the left.
 *
 * In:     r0 = pointer to a structure whose word at offset 4 is the row
 *              length in bytes
 *         r1 = row data, rewritten in place
 *         r2 = second buffer (presumably the previous row - confirm against
 *              the caller)
 * Writes: r0, r1, r2, r12, d0-d7, d17-d19, q10-q15, flags
 *         r4 and lr are used as scratch and restored on return.
 *
 * One pass handles 12 bytes (four pixels).  Both buffers are read 16 bytes
 * at a time with a stride of 12, and the row is written 4 bytes at a time
 * with a stride of 3.  Reads therefore extend past the processed data (the
 * row load inside the final pass covers the 16 bytes after the last group)
 * and the last store writes one byte past it; the buffers have to tolerate
 * that.  The loop is tested at the bottom, so at least one pass always runs.
 */
199 func    png_read_filter_row_paeth3_neon, export=1
200         push            {r4,lr}
201         ldr             r12, [r0, #4]           @ rowbytes
        /* Values carried between passes, both zero before the first pixel:
         * d3 = reconstructed pixel to the left, d4 = last pixel of the
         * second buffer consumed so far.
         */
202         vmov.i8         d3,  #0
203         vmov.i8         d4,  #0
        /* r0 becomes the row read pointer and runs one group ahead of the
         * write pointer r1.  r4 is the store stride, lr the load stride.
         */
204         mov             r0,  r1
205         mov             r4,  #3
206         mov             lr,  #12
207         vld1.8          {q11},    [r0], lr
208 1:
        /* q11 = d22:d23 holds the current row group, q10 = d20:d21 the
         * matching group of the second buffer.  Pixel 0 of each is already
         * in the low bytes; the vext instructions bring pixels 1, 2 and 3
         * (byte offsets 3, 6 and 9) down into d5/d6/d7 for the row and
         * d17/d18/d19 for the second buffer.  Macro operands are: result,
         * left, same position, one to the left in the second buffer.
         */
209         vld1.8          {q10},    [r2], lr
210         paeth           d0,  d3,  d20, d4
211         vext.8          d5,  d22, d23, #3
212         vadd.u8         d0,  d0,  d22
213         vext.8          d17, d20, d21, #3
214         paeth           d1,  d0,  d17, d20
215         vst1.32         {d0[0]},  [r1,:32], r4
216         vext.8          d6,  d22, d23, #6
217         vadd.u8         d1,  d1,  d5
218         vext.8          d18, d20, d21, #6
219         paeth           d2,  d1,  d18, d17
220         vext.8          d7,  d23, d23, #1
        /* Fetch the next row group now that d5-d7 have been extracted.  This
         * must precede the fourth store below, which overwrites the first
         * byte of the next group in memory.
         */
221         vld1.8          {q11},    [r0], lr
222         vst1.32         {d1[0]},  [r1], r4
223         vadd.u8         d2,  d2,  d6
224         vext.8          d19, d21, d21, #1
225         paeth           d3,  d2,  d19, d18
226         vst1.32         {d2[0]},  [r1], r4
        /* Keep the last second-buffer pixel for the first pixel of the next
         * pass.
         */
227         vmov            d4,  d19
228         vadd.u8         d3,  d3,  d7
229         vst1.32         {d3[0]},  [r1], r4
230         subs            r12, r12, #12
231         bgt             1b
232
        /* Restores r4 and returns by loading the saved lr into pc. */
233         pop             {r4,pc}
234 endfunc
235 #endif /* FILTER_OPTIMIZATIONS && __arm__ && __ARM_NEON__ */