/*
 * Copyright (c) 2009 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/**********************************************************************
 *                      Cortex-A8 implementation                      *
 **********************************************************************/

// Cortex-A8 implementations of memset( ) and bzero( ).  Main loop is 64-byte
// NEON stores, unless the buffer length is > 1k.  Beyond that point, there is
// little to no speed advantage with NEON (and a slight regression in some
// measured cases), so we switch to the GPRs.
//
// The crossover point should be reevaluated for future architectures.
//
//  -- Stephen Canon, August 2009

.text
.syntax unified
.code 16

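// Note: with .syntax unified and .code 16, this file assembles as Thumb-2
// (UAL syntax), so both 16-bit Thumb encodings and 32-bit instructions such
// as the NEON stores below are available.
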
// void bzero(void * destination,
//            size_t length);
//
// zeros out a buffer length bytes long, beginning at the address destination.
.thumb_func ___bzero
.globl ___bzero
.thumb_func _bzero
.globl _bzero
.align 2
___bzero:
_bzero:
    mov     r2,     r1          // match the API to memset(dest, 0, length)
    eor     r1,     r1          // and fall through into memset

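// In C terms (a sketch for illustration only, not part of this file), the
// two instructions above implement
//
//     void bzero(void *dst, size_t len) {
//         memset(dst, 0, len);    // r2 <- length, r1 <- 0
//     }
//
// by rearranging the arguments into memset's registers and falling through.
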
// void *memset(void * destination,
//              int value, size_t n);
//
// writes value converted to an unsigned char to n successive bytes, beginning
// at destination.

// Notes on register usage:
//
// Throughout this function, registers have nearly constant usage; the pattern
// is:
//
//   r0  holds the original destination pointer, unmodified.  This value
//       must be returned by the routine, so it is easiest to just leave it
//       in place.
//   r1  holds the value that is being copied into the buffer, in some stage
//       of splattedness.  The low byte is guaranteed to always have the
//       value, but the higher bytes may or may not contain copies of it.
//   r2  holds the length minus some offset, where the offset is always the
//       number of bytes that the current loop stores per iteration.
//   r3-r6,r8,r10,r11 are used with stmia, and will only ever contain
//       splatted copies of the value to be stored.
//   ip  holds a pointer to the lowest byte in the array that has not yet
//       been set to hold value.
//   q0 and q1 hold splatted copies of the value in the vector path, and are
//       otherwise unused.

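// A rough C equivalent of the flow below (illustration only; store8 and
// store64 are hypothetical stand-ins for the 8- and 64-byte block stores):
//
//     void *memset(void *dest, int value, size_t n) {
//         unsigned char c = (unsigned char)value, *p = dest;
//         if (n >= 8) {
//             while ((uintptr_t)p & 7) { *p++ = c; n--; }   // align to 8
//             if (n >= 64) {
//                 while ((uintptr_t)p & 63) {               // align to 64
//                     store8(p, c);  p += 8;  n -= 8;
//                 }
//                 while (n >= 64) { store64(p, c); p += 64; n -= 64; }
//             }
//             while (n >= 8) { store8(p, c); p += 8; n -= 8; }
//         }
//         while (n) { *p++ = c; n--; }
//         return dest;
//     }
//
// The 64-byte stores use NEON for lengths under about 1k and stmia beyond.
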
.thumb_func _memset
.globl _memset
.align 2
_memset:
    mov     ip,     r0          // copy destination pointer.
    subs    r2,     #0x8        // if length - 8 is negative (i.e. length
    and     r1,     #0xff       //    is less than 8), jump to cleanup path.
    blt     L_scalarCleanup     //

    tst     ip,     #0x7        // if the destination is doubleword
    beq     L_vectorCopy        //    aligned, jump to fast path.

0:  strb    r1,    [ip], #1     // store one byte at a time until
    sub     r2,     #1          //    destination pointer is 8 byte aligned.
    tst     ip,     #7          //
    bne     0b                  //

    cmp     r2,     #0x0        // if length - 8 is negative,
    blt     L_scalarCleanup     //    jump to the cleanup code

L_vectorCopy:
    vdup.8  q0,     r1          // splat the byte to be stored across
    subs    r2,     #0x38       //    q0 and q1, and check if length - 64
    vmov    q1,     q0          //    is negative; if so, jump to the
    blt     L_vectorCleanup     //    cleanup code.

    tst     ip,     #0x38       // if the destination is cacheline
    beq     L_cachelineAligned  //    aligned, jump to the fast path.

0:  vst1.64 {d0},  [ip, :64]!   // store one double word at a time until
    sub     r2,     #8          //    the destination is 64-byte aligned
    tst     ip,     #0x38       //
    bne     0b

    cmp     r2,     #0x0        // if length - 64 is negative,
    blt     L_vectorCleanup     //    jump to the cleanup code

L_cachelineAligned:
    cmp     r2,     #0x3c0      // if length >= 1024 (r2 is length - 64),
    bge     L_useSTMIA          //    we use stmia instead

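// Each iteration of the loop below fills a 64-byte cacheline with two
// 32-byte vst1.64 stores.  The ":256" qualifier asserts that ip is 256-bit
// aligned (which the alignment loops above have established), permitting
// the fastest NEON store accesses; a misaligned address would fault.
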
.align 4                            // main loop
0:  vst1.64 {q0,q1}, [ip, :256]!    // store 32 bytes
    subs    r2,     #0x40           //    decrement length by 64
    vst1.64 {q0,q1}, [ip, :256]!    // store 32 bytes
    bge     0b                      //    if length - 64 >= 0, continue

L_vectorCleanup:
    adds    r2,     #0x38       // if (length - 8) < 0, goto scalar cleanup
    blt     L_scalarCleanup     //

0:  subs    r2,     #8          // store one double word at a time until
    vst1.64 {d0},  [ip, :64]!   //    (length - 8) < 0.
    bge     0b

L_scalarCleanup:
    adds    r2,     #8          // restore length
    beq     1f                  // early out if zero.

0:  strb    r1,    [ip], #1     // store one byte at a time until length
    subs    r2,     #1          //    is zero.
    bne     0b                  //
1:  bx      lr                  // return.

// STMIA loop for large buffers
//
// For stores of 1024 bytes and larger, we use STMIA because we can't get
// enough of a speedup from NEON to offset the higher power draw of the NEON
// unit.
//
// This crossover should be reevaluated on future architectures.
//
// We avoid using r7 and r9 (the frame pointer and the register reserved by
// some versions of the platform ABI) even though it's not strictly necessary.

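// The two orr instructions below splat the low byte of r1 across the full
// word; in C terms (illustration only):
//
//     uint32_t v = value & 0xff;  // low byte was isolated at _memset entry
//     v |= v << 8;                // 0x000000bb -> 0x0000bbbb
//     v |= v << 16;               // 0x0000bbbb -> 0xbbbbbbbb
//
// Copying v into seven more GPRs then lets each stmia write 32 bytes.  Note
// that q0 was already splatted at L_vectorCopy, so the branch back to
// L_vectorCleanup at the end can still use vector stores for the tail.
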
L_useSTMIA:
    push    {r4,r5,r6,r8,r10,r11}   // save the non-volatile registers we use
    orr     r1,     r1, r1, lsl #8  // splat the byte across the low halfword,
    orr     r1,     r1, r1, lsl #16 //    then across the full word
    mov     r3,     r1              // copy the splatted word into seven more
    mov     r4,     r1              //    registers so that each stmia below
    mov     r5,     r1              //    stores 32 bytes
    mov     r6,     r1
    mov     r8,     r1
    mov     r10,    r1
    mov     r11,    r1
.align 4
0:  stmia   ip!,   {r1,r3,r4,r5,r6,r8,r10,r11}  // store 32 bytes
    subs    r2,     #0x40                       //    decrement length by 64
    stmia   ip!,   {r1,r3,r4,r5,r6,r8,r10,r11}  // store 32 bytes
    bge     0b                                  //    continue while length - 64 >= 0
    pop     {r4,r5,r6,r8,r10,r11}               // restore saved registers
    b       L_vectorCleanup                     // handle the remaining tail