]>
Commit | Line | Data |
---|---|---|
7b00c0c4 A |
1 | /* |
2 | * Copyright (c) 2009 Apple Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. Please obtain a copy of the License at | |
10 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
11 | * file. | |
12 | * | |
13 | * The Original Code and all software distributed under the License are | |
14 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
15 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
16 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
17 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
18 | * Please see the License for the specific language governing rights and | |
19 | * limitations under the License. | |
20 | * | |
21 | * @APPLE_LICENSE_HEADER_END@ | |
22 | */ | |
ad3c9f2a A |
23 | |
24 | #include <arm/arch.h> | |
25 | #if defined _ARM_ARCH_7 && !defined VARIANT_DYLD | |
26 | ||
27 | /********************************************************************** | |
28 | * Cortex-A8 implementation * | |
29 | **********************************************************************/ | |
30 | ||
31 | // Cortex-A8 implementations of memset( ) and bzero( ). Main loop is 64-byte | |
32 | // NEON stores, unless the buffer length is > 1k. Beyond that point, there is | |
33 | // little to no speed advantage with NEON (and a slight regression in some | |
34 | // measured cases), so we switch to the GPRs. | |
35 | // | |
36 | // The crossover point should be reevaluated for future architectures. | |
37 | // | |
38 | // -- Stephen Canon, August 2009 | |
39 | ||
40 | .text | |
41 | .syntax unified | |
42 | .code 16 | |
43 | ||
44 | // void bzero(void * destination, | |
45 | // size_t length); | |
46 | // | |
47 | // zeros out a buffer length bytes long, beginning at the address destination. | |
// ___bzero / _bzero: zero `length` bytes starting at `destination`.
// Implemented by rewriting the arguments into memset(dest, 0, length)
// form and falling through — there is deliberately NO branch or return
// here; _memset$VARIANT$CortexA8 must immediately follow this stub.
48 | .thumb_func ___bzero$VARIANT$CortexA8 | |
49 | .globl ___bzero$VARIANT$CortexA8 | |
50 | .thumb_func _bzero$VARIANT$CortexA8 | |
51 | .globl _bzero$VARIANT$CortexA8 | |
52 | .align 2 | |
53 | ___bzero$VARIANT$CortexA8: | |
54 | _bzero$VARIANT$CortexA8: | |
55 | mov r2, r1 // match the API to memset(dest, 0, length) | |
// eor r1, r1 zeroes the fill value (memset's second argument).
56 | eor r1, r1 // and fall through into memset | |
57 | ||
58 | // void *memset(void * destination, | |
59 | // int value, size_t n); | |
60 | // | |
61 | // writes value converted to an unsigned char to n successive bytes, beginning | |
62 | // at destination. | |
63 | ||
64 | // Notes on register usage: | |
65 | // | |
66 | // Throughout this function, registers have nearly constant usage; the pattern | |
67 | // is: | |
68 | // | |
69 | // r0 holds the original destination pointer, unmodified. This value | |
70 | // must be returned by the routine, so it is easiest to just leave it | |
71 | // in place. | |
72 | // r1 holds the value that is being copied into the buffer, in some stage | |
73 | // of splattedness. The low byte is guaranteed to always have the value | |
74 | // but the higher bytes may or may not contain copies of it. | |
75 | // r2 holds the length minus some offset, where the offset is always the | |
76 | // number of bytes that the current loop stores per iteration. | |
77 | // r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted | |
78 | // copies of the value to be stored. | |
79 | // ip holds a pointer to the lowest byte in the array that has not yet been | |
80 | // set to hold value. | |
81 | // q0 and q1 hold splatted copies of the value in the vector path, and are | |
82 | // otherwise unused. | |
83 | ||
// void *memset(void *destination, int value, size_t n)
// In:  r0 = destination, r1 = value, r2 = n.  Out: r0 = destination.
// r2 is kept *biased* throughout: it holds length minus the store size
// of the current loop (8, or 64), and the sign flag from subs/adds
// steers the exits.  ip walks the buffer so r0 survives for the return.
84 | .thumb_func _memset$VARIANT$CortexA8 | |
85 | .globl _memset$VARIANT$CortexA8 | |
86 | .align 2 | |
87 | _memset$VARIANT$CortexA8: | |
88 | mov ip, r0 // copy destination pointer. | |
89 | subs r2, #0x8 // if length - 8 is negative (i.e. length | |
90 | and r1, #0xff // is less than 8), jump to cleanup path. | |
91 | blt L_scalarCleanup // | |
92 | ||
93 | tst ip, #0x7 // if the destination is doubleword | |
94 | beq L_vectorCopy // aligned, jump to fast path. | |
95 | ||
// length >= 8 is guaranteed here, so at most 7 single-byte stores
// are needed and cannot exhaust the buffer.
96 | 0: strb r1, [ip], #1 // store one byte at a time until | |
97 | sub r2, #1 // destination pointer is 8 byte aligned. | |
98 | tst ip, #7 // | |
99 | bne 0b // | |
100 | ||
101 | cmp r2, #0x0 // if length - 8 is negative, | |
102 | blt L_scalarCleanup // jump to the cleanup code | |
103 | ||
104 | L_vectorCopy: | |
// subs rebiases r2 from (length - 8) to (length - 64) for the
// 64-byte main loop; vdup/vmov splat the byte across q0 and q1.
105 | vdup.8 q0, r1 // splat the byte to be stored across | |
106 | subs r2, #0x38 // q0 and q1, and check if length - 64 | |
107 | vmov q1, q0 // is negative; if so, jump to the | |
108 | blt L_vectorCleanup // cleanup code. | |
109 | ||
110 | tst ip, #0x38 // if the destination is cacheline | |
111 | beq L_cachelineAligned // aligned, jump to the fast path. | |
112 | ||
113 | 0: vst1.64 {d0}, [ip, :64]! // store one double word at a time until | |
114 | sub r2, #8 // the destination is 64-byte aligned | |
115 | tst ip, #0x38 // | |
116 | bne 0b | |
117 | ||
118 | cmp r2, #0x0 // if length - 64 is negative, | |
119 | blt L_vectorCleanup // jump to the cleanup code | |
120 | ||
121 | L_cachelineAligned: | |
122 | cmp r2, #0x3c0 // r2 = length - 64, so this is length >= 1024: | |
123 | bge L_useSTMIA // we use stmia instead | |
124 | ||
// Main NEON loop: two 32-byte stores (q0+q1 each) = 64 bytes per
// iteration; the subs between them sets the flags tested by bge.
125 | .align 4 // main loop | |
126 | 0: vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes | |
127 | subs r2, #0x40 // decrement length by 64 | |
128 | vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes | |
129 | bge 0b // if length - 64 >= 0, continue | |
130 | ||
131 | L_vectorCleanup: | |
// Rebias r2 from (length - 64) back to (length - 8) for the
// doubleword tail loop.
132 | adds r2, #0x38 // if (length - 8) < 0, goto scalar cleanup | |
133 | blt L_scalarCleanup // | |
134 | ||
135 | 0: subs r2, #8 // store one double word at a time until | |
136 | vst1.64 {d0}, [ip, :64]! // (length - 8) < 0. | |
137 | bge 0b | |
138 | ||
139 | L_scalarCleanup: | |
140 | adds r2, #8 // restore length | |
141 | beq 1f // early out if zero. | |
142 | ||
143 | 0: strb r1, [ip], #1 // store one byte at a time until length | |
144 | subs r2, #1 // is zero. | |
145 | bne 0b // | |
// r0 still holds the original destination, which is the return value.
146 | 1: bx lr // return. | |
147 | ||
148 | // STMIA loop for large buffers | |
149 | // | |
150 | // For stores larger than 1024 bytes, we use STMIA because we can't get enough | |
151 | // of a speedup from NEON to offset the higher power draw of the NEON unit. | |
152 | // | |
153 | // This crossover should be reevaluated on future architectures. | |
154 | // | |
155 | // We avoid using r7 and r9 even though it's not strictly necessary. | |
156 | ||
157 | L_useSTMIA: | |
// r4-r6, r8, r10, r11 are callee-saved, so preserve them; the two
// orrs splat the low byte of r1 across all four of its bytes, then
// the splatted word is copied into the other seven store registers.
158 | push {r4,r5,r6,r8,r10,r11} | |
159 | orr r1, r1, r1, lsl #8 | |
160 | orr r1, r1, r1, lsl #16 | |
161 | mov r3, r1 | |
162 | mov r4, r1 | |
163 | mov r5, r1 | |
164 | mov r6, r1 | |
165 | mov r8, r1 | |
166 | mov r10, r1 | |
167 | mov r11, r1 | |
// Each stmia writes 8 registers = 32 bytes; two per iteration = 64,
// matching the subs #0x40 and the (length - 64) bias already in r2.
168 | .align 4 | |
169 | 0: stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} | |
170 | subs r2, #0x40 | |
171 | stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11} | |
172 | bge 0b | |
173 | pop {r4,r5,r6,r8,r10,r11} | |
// On exit r2 = length - 64 (negative); L_vectorCleanup rebiases it
// and finishes the tail with NEON doubleword and byte stores.
174 | b L_vectorCleanup | |
175 | ||
176 | #endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD | |
177 |