/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define __APPLE_API_PRIVATE
#include <machine/cpu_capabilities.h>
#undef __APPLE_API_PRIVATE

// These functions have migrated to the comm page.

        .text
        .globl  _bcopy
        .globl  _memcpy
        .globl  _memmove

        .align  5
_bcopy:                             // void bcopy(const void *src, void *dst, size_t len)
        ba      _COMM_PAGE_BCOPY

        .align  5
_memcpy:                            // void* memcpy(void *dst, const void *src, size_t len)
_memmove:                           // void* memmove(void *dst, const void *src, size_t len)
        ba      _COMM_PAGE_MEMCPY
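
// Note: _COMM_PAGE_BCOPY and _COMM_PAGE_MEMCPY are absolute addresses inside the
// commpage (defined in <machine/cpu_capabilities.h>), and "ba" is a branch-absolute,
// so these stubs jump directly to whichever CPU-optimized routine the kernel
// installed in the commpage for this processor.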


#if 0
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/17/2002, for G3, G4, and G4+.
 *
 * There are many paths through this code, depending on length, reverse/forward,
 * processor type, and alignment. We use reverse paths only when the operands
 * overlap and the destination is higher than the source. They are not quite as
 * fast as the forward paths.
 *
 * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
 * the inner loops for long operands. DST is less effective than DCBT, because it
 * can get out of sync with the inner loop. DCBTST is usually not a win, so we
 * don't use it except during initialization when we're not using the LSU.
 * We don't DCBT on G3, which only handles one load miss at a time.
 *
 * We don't use DCBZ, because it takes an alignment exception on uncached memory
 * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
 * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
 *
 * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
 * since it avoids the read of destination cache lines. But for the hot-cache case
 * it is always slower, because of the cycles spent needlessly zeroing data. Some
 * machines store-gather and can cancel the read if all bytes of a line are stored,
 * others cannot. Unless explicitly told which is better, we time loops with and
 * without DCBA and use the fastest. Note that we never DCBA in reverse loops,
 * since by definition they are overlapped so dest lines will be in the cache.
 *
 * For longer operands we use an 8-element branch table, based on the CPU type,
 * to select the appropriate inner loop. The branch table is indexed as follows:
 *
 *      bit  10000 set if a Reverse move is required
 *      bits 01100 encode the relative operand alignment: 0=unaligned, 1=word,
 *                 2=doubleword, and 3=quadword.
 *
 * By "relatively" n-byte aligned, we mean the source and destination are a multiple
 * of n bytes apart (they need not be absolutely aligned.)
 *
 * The branch table for the running CPU type is pointed to by LBranchTablePtr.
 * Initially, LBranchTablePtr points to G3's table, since that is the lowest
 * common denominator that will run on any CPU. Later, pthread initialization
 * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
 * up the correct pointer for the running CPU.
 *
 * We distinguish between "short", "medium", and "long" operands:
 *      short  (<= 32 bytes)    most common case, minimum path length is important
 *      medium (> 32, < kLong)  too short for Altivec or use of cache ops like DCBA
 *      long   (>= kLong)       long enough for cache ops and to amortize use of Altivec
 *
 * WARNING: kLong must be >=96, due to implicit assumptions about operand length.
 */
#define kLong 96
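
// Example of the branch-table indexing described above (illustrative):
//      byte offset into the 8-entry table = (16 if reverse, else 0) + 4*(alignment code)
//      eg, forward/unaligned -> entry 0, forward/8-byte relative -> entry 2,
//          reverse/16-byte relative -> entry 7.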

/* Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *      r0    = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
 *      r2    = "w8" or VRSave ("rv")
 *      r3    = not used, as memcpy and memmove return 1st parameter as a value
 *      r4    = source ptr ("rs")
 *      r5    = count of bytes to move ("rc")
 *      r6    = "w1", "c16", or "cm17"
 *      r7    = "w2", "c32", or "cm33"
 *      r8    = "w3", "c48", or "cm49"
 *      r9    = "w4", "c64", or "cm1"
 *      r10   = "w5", "c96", or "cm97"
 *      r11   = "w6", "c128", "cm129", or return address ("ra")
 *      r12   = destination ptr ("rd")
 *      f0-f8 = used for moving 8-byte aligned data
 *      v0    = permute vector ("vp")
 *      v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
 *      v5-v7 = permuted qw's ("vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define ra      r11
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define c64     r9
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vx      v5
#define vy      v6
#define vz      v7

#define VRSave  256
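// (256 is the SPR number of the AltiVec VRSAVE register, used below with mfspr/mtspr.)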

#include <architecture/ppc/asm_help.h>

// The branch tables, 8 entries per CPU type.
// NB: we depend on 5 low-order 0s in the address of branch tables.

        .data
        .align  5                       // must be 32-byte aligned

// G3 (the default CPU type)

LG3:
        .long   LForwardWord            // 000: forward, unaligned
        .long   LForwardFloat           // 001: forward, 4-byte aligned
        .long   LForwardFloat           // 010: forward, 8-byte aligned
        .long   LForwardFloat           // 011: forward, 16-byte aligned
        .long   LReverseWord            // 100: reverse, unaligned
        .long   LReverseFloat           // 101: reverse, 4-byte aligned
        .long   LReverseFloat           // 110: reverse, 8-byte aligned
        .long   LReverseFloat           // 111: reverse, 16-byte aligned

// G4s that benefit from DCBA.

LG4UseDcba:
        .long   LForwardVecUnal32Dcba   // 000: forward, unaligned
        .long   LForwardVecUnal32Dcba   // 001: forward, 4-byte aligned
        .long   LForwardVecUnal32Dcba   // 010: forward, 8-byte aligned
        .long   LForwardVecAlig32Dcba   // 011: forward, 16-byte aligned
        .long   LReverseVectorUnal32    // 100: reverse, unaligned
        .long   LReverseVectorUnal32    // 101: reverse, 4-byte aligned
        .long   LReverseVectorUnal32    // 110: reverse, 8-byte aligned
        .long   LReverseVectorAligned32 // 111: reverse, 16-byte aligned

// G4s that should not use DCBA.

LG4NoDcba:
        .long   LForwardVecUnal32NoDcba // 000: forward, unaligned
        .long   LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
        .long   LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
        .long   LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
        .long   LReverseVectorUnal32    // 100: reverse, unaligned
        .long   LReverseVectorUnal32    // 101: reverse, 4-byte aligned
        .long   LReverseVectorUnal32    // 110: reverse, 8-byte aligned
        .long   LReverseVectorAligned32 // 111: reverse, 16-byte aligned


// Pointer to the 8-element branch table for running CPU type:

LBranchTablePtr:
        .long   LG3                     // default to G3 until "bcopy_initialize" called


// The CPU capability vector, initialized in pthread_init().
// "_bcopy_initialize" uses this to set up LBranchTablePtr:

        .globl  __cpu_capabilities
__cpu_capabilities:
        .long   0

// Bit definitions for _cpu_capabilities:

#define kHasAltivec     0x01
#define k64Bit          0x02
#define kCache32        0x04
#define kCache64        0x08
#define kCache128       0x10
#define kUseDcba        0x20
#define kNoDcba         0x40
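
// For example (based on the tests in _bcopy_initialize below): a G3 sets just
// kCache32, while a G4 sets kHasAltivec+kCache32, possibly with kUseDcba or
// kNoDcba already chosen.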


        .text
        .globl  _bcopy
        .globl  _memcpy
        .globl  _memmove
        .globl  __bcopy_initialize


// Main entry points.

        .align  5
_bcopy:                             // void bcopy(const void *src, void *dst, size_t len)
        mr      r10,r3              // swap source and dest ptrs, to be like memcpy
        mr      r3,r4
        mr      r4,r10
_memcpy:                            // void* memcpy(void *dst, const void *src, size_t len)
_memmove:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  cr7,rc,32           // length <= 32 bytes?
        sub.    w1,r3,rs            // must move in reverse if (rd-rs)<rc; set cr0 beq if src==dst
        dcbt    0,rs                // touch in the first line of source
        cmplw   cr6,w1,rc           // set cr6 blt iff we must move reverse
        cmplwi  cr1,rc,kLong-1      // set cr1 bgt if long
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bgt-    cr7,LMedium         // longer than 32 bytes
        dcbtst  0,rd                // touch in destination
        beq-    cr7,LMove32         // special case moves of 32 bytes
        blt-    cr6,LShortReverse0

// Forward short operands. This is the most frequent case, so it is inline.
// We also end up here to xfer the last 0-31 bytes of longer operands.
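// The low 5 bits of the length drive the moves: bit 27 (the 16-byte chunk) is
// tested with the andi., and cr7 (bits 28-31, loaded by mtcrf 0x01) selects the
// 8-, 4-, 2-, and 1-byte moves via the bf tests below.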

LShort:                             // WARNING: can fall into this routine
        andi.   r0,rc,0x10          // test bit 27 separately (sometimes faster than a mtcrf)
        mtcrf   0x01,rc             // move rest of length to cr7
        beq     1f                  // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
1:
LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f               // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f               // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                  // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands, up to 32 bytes in length.
// This is also used to transfer the last 0-31 bytes of longer operands.

LShortReverse0:
        add     rs,rs,rc            // adjust ptrs for reverse move
        add     rd,rd,rc
LShortReverse:
        andi.   r0,rc,0x10          // test bit 27 separately (sometimes faster than a mtcrf)
        mtcrf   0x01,rc             // move rest of length to cr7
        beq     1f                  // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
1:
LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f               // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f               // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
        stb     w1,-1(rd)
        blr


// Special case for 32-byte moves. Too long for LShort, too common for LMedium.

LMove32:
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        lwz     w5,16(rs)
        lwz     w6,20(rs)
        lwz     w7,24(rs)
        lwz     w8,28(rs)
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        stw     w5,16(rd)
        stw     w6,20(rd)
        stw     w7,24(rd)
        stw     w8,28(rd)
LExit:
        blr


// Medium length operands (32 < rc < kLong.) These loops run on all CPUs, as the
// operands are not long enough to bother with the branch table, cache ops, or
// Altivec. We word-align the source, not the dest as we do for long operands,
// since doing so is faster on G4 and later; we never DCBA on medium-length
// operands, and the opportunity to cancel reads of dest cache lines is limited.
//      w1 = (rd-rs), used to check for alignment
//      cr0 = set on (rd-rs)
//      cr1 = bgt if long operand
//      cr6 = blt if reverse move

LMedium:
        dcbtst  0,rd                // touch in 1st line of destination
        rlwinm  r0,w1,0,29,31       // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
        beq-    LExit               // early exit if (rs==rd), avoiding use of "beqlr"
        neg     w2,rs               // we align source, not dest, and assume forward
        cmpwi   cr5,r0,0            // set cr5 beq if doubleword aligned
        bgt-    cr1,LLong           // handle long operands
        andi.   w3,w2,3             // w3 <- #bytes to word-align source
        blt-    cr6,LMediumReverse  // handle reverse move
        lwz     w1,0(rs)            // pre-fetch first 4 bytes of source
        beq-    cr5,LMediumAligned  // operands are doubleword aligned
        sub     rc,rc,w3            // adjust count for alignment
        mtcrf   0x01,rc             // remaining byte count (0-15) to cr7 for LShort16
        srwi    w4,rc,4             // w4 <- number of 16-byte chunks to xfer (>=1)
        mtctr   w4                  // prepare loop count
        beq+    2f                  // source already aligned

        lwzx    w2,w3,rs            // get 1st aligned word (which we might partially overwrite)
        add     rs,rs,w3            // word-align source ptr
        stw     w1,0(rd)            // store all (w3) bytes at once to avoid a loop
        add     rd,rd,w3
        mr      w1,w2               // first aligned word to w1
        b       2f

        .align  4                   // align inner loops
1:                                  // loop over 16-byte chunks
        lwz     w1,0(rs)
2:
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    1b

        b       LShort16


// Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
// and reduced throughput for floating pt loads and stores; future processors will probably
// have even worse lfd/stfd performance. We use it here because it is so important for G3,
// and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
// G3-only long operand loops use floating pt even for word-aligned operands.
//      w2 = neg(rs)
//      w1 = first 4 bytes of source

LMediumAligned:
        andi.   w3,w2,7             // already aligned?
        sub     rc,rc,w3            // adjust count by 0-7 bytes
        lfdx    f0,rs,w3            // pre-fetch first aligned source doubleword
        srwi    w4,rc,5             // get count of 32-byte chunks (might be 0 if unaligned)
        mtctr   w4
        beq-    LForwardFloatLoop1  // already aligned

        cmpwi   w4,0                // are there any 32-byte chunks to xfer?
        lwz     w2,4(rs)            // get 2nd (unaligned) source word
        add     rs,rs,w3            // doubleword align source pointer
        stw     w1,0(rd)            // store first 8 bytes of source to align...
        stw     w2,4(rd)            // ...which could overwrite source
        add     rd,rd,w3            // doubleword align destination
        bne+    LForwardFloatLoop1  // at least 1 chunk, so enter loop

        subi    rc,rc,8             // unfortunate degenerate case: no chunks to xfer
        stfd    f0,0(rd)            // must store f0 since source might have been overwritten
        addi    rs,rs,8
        addi    rd,rd,8
        b       LShort

// Medium reverse moves. This loop runs on all processors.

LMediumReverse:
        add     rs,rs,rc            // point to other end of operands when in reverse
        add     rd,rd,rc
        andi.   w3,rs,3             // w3 <- #bytes to word-align source
        lwz     w1,-4(rs)           // pre-fetch 1st 4 bytes of source
        sub     rc,rc,w3            // adjust count
        srwi    w4,rc,4             // get count of 16-byte chunks (>=1)
        mtcrf   0x01,rc             // remaining byte count (0-15) to cr7 for LShortReverse16
        mtctr   w4                  // prepare loop count
        beq+    2f                  // source already aligned

        sub     rs,rs,w3            // word-align source ptr
        lwz     w2,-4(rs)           // get 1st aligned word which we may overwrite
        stw     w1,-4(rd)           // store all 4 bytes to align without a loop
        sub     rd,rd,w3
        mr      w1,w2               // shift 1st aligned source word to w1
        b       2f

1:
        lwz     w1,-4(rs)
2:
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Long operands. Use branch table to decide which loop to use.
//      w1 = (rd-rs), used to determine alignment

LLong:
        xor     w4,w1,rc            // we must move reverse if (rd-rs)<rc
        mflr    ra                  // save return address
        rlwinm  w5,w1,1,27,30       // w5 <- ((w1 & 0xF) << 1)
        bcl     20,31,1f            // use reserved form to get our location
1:
        mflr    w3                  // w3 == addr(1b)
        lis     w8,0x0408           // load a 16 element, 2-bit array into w8...
        cntlzw  w4,w4               // find first difference between (rd-rs) and rc
        addis   w2,w3,ha16(LBranchTablePtr-1b)
        ori     w8,w8,0x040C        // ...used to map w5 to alignment encoding (ie, to 0-3)
        lwz     w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
        slw     w4,rc,w4            // bit 0 of w4 set iff (rd-rs)<rc
        rlwnm   w5,w8,w5,28,29      // put alignment encoding in bits 01100 of w5
        rlwimi  w2,w4,5,27,27       // put reverse bit in bit 10000 of branch table address
        lwzx    w3,w2,w5            // w3 <- load loop address from branch table
        neg     w1,rd               // start to compute destination alignment
        mtctr   w3
        andi.   r0,w1,0x1F          // r0 <- bytes req'd to 32-byte align dest (if forward move)
        bctr                        // NB: r0/cr0 and w1 are passed as parameters
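
// How the dispatch above works (illustrative): cntlzw finds the first bit position
// where (rd-rs) and rc differ; shifting rc left by that count leaves bit 0 of w4
// equal to 1 exactly when (rd-rs)<rc, ie when we must move in reverse. That bit is
// inserted into bit 0x10 of the 32-byte-aligned table address, selecting the
// reverse half of the table, while w5 supplies the alignment offset (0, 4, 8, or 12).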


// G3, forward, long, unaligned.
//      w1 = neg(rd)

LForwardWord:
        andi.   w3,w1,3             // w3 <- #bytes to word-align destination
        mtlr    ra                  // restore return address
        sub     rc,rc,w3            // adjust count for alignment
        srwi    r0,rc,5             // number of 32-byte chunks to xfer (>=1)
        mtctr   r0                  // prepare loop count
        beq+    1f                  // dest already aligned

        lwz     w2,0(rs)            // get first 4 bytes of source
        lwzx    w1,w3,rs            // get source bytes we might overwrite
        add     rs,rs,w3            // adjust source ptr
        stw     w2,0(rd)            // store all 4 bytes to avoid a loop
        add     rd,rd,w3            // word-align destination
        b       2f
1:
        lwz     w1,0(rs)
2:
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        lwz     w5,16(rs)
        lwz     w6,20(rs)
        lwz     w7,24(rs)
        lwz     w8,28(rs)
        addi    rs,rs,32
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        stw     w5,16(rd)
        stw     w6,20(rd)
        stw     w7,24(rd)
        stw     w8,28(rd)
        addi    rd,rd,32
        bdnz    1b

        b       LShort


// G3, forward, long, word aligned. We use floating pt even when only word aligned.
//      w1 = neg(rd)

LForwardFloat:
        andi.   w3,w1,7             // w3 <- #bytes to doubleword-align destination
        mtlr    ra                  // restore return address
        sub     rc,rc,w3            // adjust count for alignment
        srwi    r0,rc,5             // number of 32-byte chunks to xfer (>=1)
        mtctr   r0                  // prepare loop count
        beq     LForwardFloatLoop   // dest already aligned

        lwz     w1,0(rs)            // get first 8 bytes of source
        lwz     w2,4(rs)
        lfdx    f0,w3,rs            // get source bytes we might overwrite
        add     rs,rs,w3            // word-align source ptr
        stw     w1,0(rd)            // store all 8 bytes to avoid a loop
        stw     w2,4(rd)
        add     rd,rd,w3
        b       LForwardFloatLoop1

        .align  4                   // align since this loop is executed by G4s too
LForwardFloatLoop:
        lfd     f0,0(rs)
LForwardFloatLoop1:                 // enter here from LMediumAligned and above
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        addi    rs,rs,32
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        addi    rd,rd,32
        bdnz    LForwardFloatLoop

        b       LShort


// G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecAlig32Dcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        mtlr    ra                  // restore return address before loading c128
        li      c128,128
        b       1f                  // enter aligned loop

        .align  5                   // long loop heads should be at least 16-byte aligned
1:                                  // loop over aligned 64-byte chunks
        dcbt    c96,rs              // pre-fetch three cache lines ahead
        dcbt    c128,rs             // and four
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd                // avoid read of destination cache lines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

LForwardVectorAlignedEnd:           // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
        beq-    3f                  // no leftover quadwords
        mtctr   r0
2:                                  // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    2b
3:
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
        blr


// G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecAlig32NoDcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        mtlr    ra                  // restore return address before loading c128
        li      c128,128
        b       1f                  // enter aligned loop

        .align  4                   // balance 13-word loop between QWs...
        nop                         // ...which improves performance 5% +/-
        nop
1:                                  // loop over aligned 64-byte chunks
        dcbt    c96,rs              // pre-fetch three cache lines ahead
        dcbt    c128,rs             // and four
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        b       LForwardVectorAlignedEnd


// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
// some CPUs, this routine is no slower than the simpler aligned version that does
// not use permutes. But it cannot be used with aligned operands, because of the
// way it prefetches source QWs.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecUnal32Dcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        lvx     v1,0,rs             // prime loop
        mtlr    ra                  // restore return address before loading c128
        lvsl    vp,0,rs             // get permute vector to shift left
        li      c128,128
        b       1f                  // enter aligned loop

        .align  4                   // long loop heads should be at least 16-byte aligned
1:                                  // loop over aligned 64-byte destination chunks
        lvx     v2,c16,rs
        dcbt    c96,rs              // touch 3rd cache line ahead
        lvx     v3,c32,rs
        dcbt    c128,rs             // touch 4th cache line ahead
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vx,v1,v2,vp
        lvx     v1,0,rs
        vperm   vy,v2,v3,vp
        dcba    0,rd                // avoid read of destination lines
        stvx    vx,0,rd
        vperm   vz,v3,v4,vp
        stvx    vy,c16,rd
        dcba    c32,rd
        vperm   vx,v4,v1,vp
        stvx    vz,c32,rd
        stvx    vx,c48,rd
        addi    rd,rd,64
        bdnz    1b

LForwardVectorUnalignedEnd:         // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        beq-    3f                  // no leftover quadwords
        mtctr   r0
2:                                  // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2            // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    2b
3:
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
        blr


// G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
//      r0/cr0 = #bytes to 32-byte align

LForwardVecUnal32NoDcba:
        bnel+   LAlign32            // align destination iff necessary
        bl      LPrepareForwardVectors
        lvx     v1,0,rs             // prime loop
        mtlr    ra                  // restore return address before loading c128
        lvsl    vp,0,rs             // get permute vector to shift left
        li      c128,128
        b       1f                  // enter aligned loop

        .align  4
        nop                         // balance 17-word loop between QWs
        nop
1:                                  // loop over aligned 64-byte destination chunks
        lvx     v2,c16,rs
        dcbt    c96,rs              // touch 3rd cache line ahead
        lvx     v3,c32,rs
        dcbt    c128,rs             // touch 4th cache line ahead
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vx,v1,v2,vp
        lvx     v1,0,rs
        vperm   vy,v2,v3,vp
        stvx    vx,0,rd
        vperm   vz,v3,v4,vp
        stvx    vy,c16,rd
        vperm   vx,v4,v1,vp
        stvx    vz,c32,rd
        stvx    vx,c48,rd
        addi    rd,rd,64
        bdnz    1b

        b       LForwardVectorUnalignedEnd


// G3 Reverse, long, unaligned.

LReverseWord:
        bl      LAlign8Reverse      // 8-byte align destination
        mtlr    ra                  // restore return address
        srwi    r0,rc,5             // get count of 32-byte chunks to xfer (> 1)
        mtctr   r0
1:
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwz     w4,-16(rs)
        stw     w1,-4(rd)
        lwz     w5,-20(rs)
        stw     w2,-8(rd)
        lwz     w6,-24(rs)
        stw     w3,-12(rd)
        lwz     w7,-28(rs)
        stw     w4,-16(rd)
        lwzu    w8,-32(rs)
        stw     w5,-20(rd)
        stw     w6,-24(rd)
        stw     w7,-28(rd)
        stwu    w8,-32(rd)
        bdnz    1b

        b       LShortReverse


// G3 Reverse, long, word aligned.

LReverseFloat:
        bl      LAlign8Reverse      // 8-byte align
        mtlr    ra                  // restore return address
        srwi    r0,rc,5             // get count of 32-byte chunks to xfer (> 1)
        mtctr   r0
1:
        lfd     f0,-8(rs)
        lfd     f1,-16(rs)
        lfd     f2,-24(rs)
        lfdu    f3,-32(rs)
        stfd    f0,-8(rd)
        stfd    f1,-16(rd)
        stfd    f2,-24(rd)
        stfdu   f3,-32(rd)
        bdnz    1b

        b       LShortReverse


// G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.

LReverseVectorAligned32:
        bl      LAlign32Reverse     // 32-byte align destination iff necessary
        bl      LPrepareReverseVectors
        mtlr    ra                  // restore return address before loading cm129
        li      cm129,-129
        b       1f                  // enter aligned loop

        .align  4
        nop                         // must start in 3rd word of QW...
        nop                         // ...to keep balanced
1:                                  // loop over aligned 64-byte chunks
        dcbt    cm97,rs             // pre-fetch three cache lines ahead
        dcbt    cm129,rs            // and four
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

LReverseVectorAlignedEnd:           // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
        beq     3f                  // no leftover quadwords
        mtctr   r0
2:                                  // loop over 1-3 quadwords
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes iff any
        blr


// G4 Reverse, long, unaligned, 32-byte DCBT.

LReverseVectorUnal32:
        bl      LAlign32Reverse     // align destination iff necessary
        bl      LPrepareReverseVectors
        lvx     v1,cm1,rs           // prime loop
        mtlr    ra                  // restore return address before loading cm129
        lvsl    vp,0,rs             // get permute vector to shift left
        li      cm129,-129
        b       1f                  // enter aligned loop

        .align  4
        nop                         // start loop in 3rd word of QW to balance
        nop
1:                                  // loop over aligned 64-byte destination chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs             // touch in 3rd source block
        lvx     v3,cm33,rs
        dcbt    cm129,rs            // touch in 4th
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

LReverseVectorUnalignedEnd:         // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
        beq     3f                  // no leftover quadwords
        mtctr   r0
2:                                  // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2            // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   VRSave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes iff any
        blr


// Subroutine to prepare for 64-byte forward vector loops.
// Returns many things:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      c16..c96 loaded
//      rv = original value of VRSave
// NB: c128 not set (if needed), since it is still "ra"
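// (Setting the extra bits in VRSave below marks v0-v7 as live so the OS preserves
// them across context switches; the caller's original mask is kept in rv and
// restored when the vector loops finish.)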

LPrepareForwardVectors:
        mfspr   rv,VRSave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (>=1)
        oris    w1,rv,0xFF00        // we use v0-v7
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShort16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        mtspr   VRSave,w1           // update mask
        li      c16,16              // get constants used in lvx/stvx
        li      c32,32
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        li      c48,48
        li      c96,96
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        blr


// Subroutine to prepare for 64-byte reverse vector loops.
// Returns many things:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1..cm97 loaded
//      rv = original value of VRSave
// NB: cm129 not set (if needed), since it is still "ra"

LPrepareReverseVectors:
        mfspr   rv,VRSave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (>=1)
        oris    w1,rv,0xFF00        // we use v0-v7
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        mtspr   VRSave,w1           // update mask
        li      cm1,-1              // get constants used in lvx/stvx
        li      cm17,-17
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        li      cm33,-33
        li      cm49,-49
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        li      cm97,-97
        blr


// Subroutine to align destination on a 32-byte boundary.
//      r0 = number of bytes to xfer (0-31)

LAlign32:
        mtcrf   0x01,r0             // length to cr (faster to change 1 CR at a time)
        mtcrf   0x02,r0
        sub     rc,rc,r0            // adjust length
        bf      31,1f               // skip if no odd byte
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f               // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f               // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f               // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bflr    27                  // done if no quadword to move
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        blr

// Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
//      rs and rd still point to low end of operands
//      we adjust rs and rd to point to last byte moved

LAlign32Reverse:
        add     rd,rd,rc            // point to last byte moved (ie, 1 past end of operands)
        add     rs,rs,rc
        andi.   r0,rd,0x1F          // r0 <- #bytes that must be moved to align destination
        mtcrf   0x01,r0             // length to cr (faster to change 1 CR at a time)
        mtcrf   0x02,r0
        sub     rc,rc,r0            // update length
        beqlr-                      // destination already 32-byte aligned

        bf      31,1f               // odd byte?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f               // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f               // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f               // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bflr    27                  // done if no quadwords
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        blr


// Subroutine to align destination on an 8-byte boundary for reverse moves.
//      rs and rd still point to low end of operands
//      we adjust rs and rd to point to last byte moved

LAlign8Reverse:
        add     rd,rd,rc            // point to last byte moved (ie, 1 past end of operands)
        add     rs,rs,rc
        andi.   r0,rd,0x7           // r0 <- #bytes that must be moved to align destination
        beqlr-                      // destination already 8-byte aligned
        mtctr   r0                  // set up for loop
        sub     rc,rc,r0            // update length
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

        blr


// Called by pthread initialization to set up the branch table pointer based on
// the CPU capability vector. This routine may be called more than once (for
// example, during testing.)
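// In outline: read __cpu_capabilities; if this is an untyped G4 (Altivec plus a
// 32-byte cache line, with neither kUseDcba nor kNoDcba set), time a store loop
// with and without DCBA and pick the faster; then point LBranchTablePtr at the
// LG3, LG4UseDcba, or LG4NoDcba table as appropriate.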

// Size of the buffer we use to do DCBA timing on G4:
#define kBufSiz 1024

// Stack frame size, which contains the 128-byte-aligned buffer:
#define kSFSize (kBufSiz+128+16)

// Iterations of the timing loop:
#define kLoopCnt 5

// Bit in cr5 used as a flag in timing loop:
#define kDCBA 22

__bcopy_initialize:                 // int _bcopy_initialize(void)
        mflr    ra                  // get return address
        stw     ra,8(r1)            // save it
        stwu    r1,-kSFSize(r1)     // carve our temp buffer from the stack
        addi    w6,r1,127+16        // get base address...
        rlwinm  w6,w6,0,0,24        // ...of our buffer, 128-byte aligned
        bcl     20,31,1f            // get our PIC base
1:
        mflr    w1
        addis   w2,w1,ha16(__cpu_capabilities - 1b)
        lwz     w3,lo16(__cpu_capabilities - 1b)(w2)
        andi.   r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
        cmpwi   r0,kCache32+kHasAltivec // untyped G4?
        li      w8,0                // assume no need to test
        bne     2f                  // not an untyped G4, so do not test

// G4, but neither kUseDcba nor kNoDcba is set. Time and select the fastest.

        crset   kDCBA               // first, use DCBA
        bl      LTest32             // time it
        mr      w8,w4               // w8 <- best time using DCBA
        srwi    r0,w8,3             // bias 12 pct in favor of not using DCBA...
        add     w8,w8,r0            // ...because DCBA is always slower with warm cache
        crclr   kDCBA
        bl      LTest32             // w4 <- best time without DCBA
        cmplw   w8,w4               // which is better?
        li      w8,kUseDcba         // assume using DCBA is faster
        blt     2f
        li      w8,kNoDcba          // no DCBA is faster

// Which branch table to use?

2:                                  // here with w8 = 0, kUseDcba, or kNoDcba
        bcl     20,31,4f            // get our PIC base again
4:
        mflr    w1
        addis   w2,w1,ha16(__cpu_capabilities - 4b)
        lwz     w3,lo16(__cpu_capabilities - 4b)(w2)
        or      w3,w3,w8            // add in kUseDcba or kNoDcba if untyped G4
        mr      r3,w8               // return dynamic selection, if any (used in testing)

        andi.   r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
        cmpwi   r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
        addis   w4,w1,ha16(LG4UseDcba - 4b)
        addi    w4,w4,lo16(LG4UseDcba - 4b)
        beq     5f

        andi.   r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
        cmpwi   r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
        addis   w4,w1,ha16(LG4NoDcba - 4b)
        addi    w4,w4,lo16(LG4NoDcba - 4b)
        beq     5f

        andi.   r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
        cmpwi   r0,kCache32         // G3?
        addis   w4,w1,ha16(LG3 - 4b)
        addi    w4,w4,lo16(LG3 - 4b)
        beq     5f

// Map unrecognized CPU types to G3 (lowest common denominator).

5:                                  // w4 <- branch table pointer
        addis   w5,w1,ha16(LBranchTablePtr - 4b)
        stw     w4,lo16(LBranchTablePtr - 4b)(w5)
        lwz     ra,kSFSize+8(r1)    // recover return address
        mtlr    ra                  // restore it
        lwz     r1,0(r1)            // pop off our stack frame
        blr                         // return dynamic selection (or 0) in r3


// Subroutine to time stores on a CPU with 32-byte cache lines.
//      kDCBA = set if we should use DCBA
//      w6 = base of buffer to use for test (kBufSiz bytes)
//      w4 = time of fastest loop (returned)

LTest32:
        li      w1,kLoopCnt         // number of times to loop
        li      w4,-1               // initialize fastest time
1:
        mr      rd,w6               // initialize buffer ptr
        li      r0,kBufSiz/32       // r0 <- cache blocks to test
        mtctr   r0
2:
        dcbf    0,rd                // first, force the blocks out of the cache
        addi    rd,rd,32
        bdnz    2b
        sync                        // make sure all the flushes take
        mr      rd,w6               // re-initialize buffer ptr
        mtctr   r0                  // reset cache-block count
        mftbu   w5                  // remember upper half so we can check for carry
        mftb    w2                  // start the timer
3:                                  // loop over cache blocks
        bf      kDCBA,4f            // should we DCBA?
        dcba    0,rd
4:
        stfd    f1,0(rd)            // store the entire cache block
        stfd    f1,8(rd)
        stfd    f1,16(rd)
        stfd    f1,24(rd)
        addi    rd,rd,32
        bdnz    3b
        mftb    w3
        mftbu   r0
        cmpw    r0,w5               // did timebase carry?
        bne     1b                  // yes, retest rather than fuss
        sub     w3,w3,w2            // w3 <- time for this loop
        cmplw   w3,w4               // faster than current best?
        bge     5f                  // no
        mr      w4,w3               // remember fastest time through loop
5:
        subi    w1,w1,1             // decrement outer loop count
        cmpwi   w1,0                // more to go?
        bne     1b                  // loop if so
        blr

#endif /* 0 */