1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
7 *
8 * This file contains Original Code and/or Modifications of Original Code
9 * as defined in and that are subject to the Apple Public Source License
10 * Version 2.0 (the 'License'). You may not use this file except in
11 * compliance with the License. Please obtain a copy of the License at
12 * http://www.opensource.apple.com/apsl/ and read it before using this
13 * file.
14 *
15 * The Original Code and all software distributed under the License are
16 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
17 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
18 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
20 * Please see the License for the specific language governing rights and
21 * limitations under the License.
22 *
23 * @APPLE_LICENSE_HEADER_END@
24 */
25
26 #define __APPLE_API_PRIVATE
27 #include <machine/cpu_capabilities.h>
28 #undef __APPLE_API_PRIVATE
29
30 // These functions have migrated to the comm page.
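// The comm page is a kernel-maintained region mapped at a fixed address in every process;
// _COMM_PAGE_BCOPY and _COMM_PAGE_MEMCPY (defined via cpu_capabilities.h) are absolute
// addresses within it, where the kernel has installed the variant best suited to the
// running CPU.  That is why a "ba" (branch absolute) suffices here.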
31
32 .text
33 .globl _bcopy
34 .globl _memcpy
35 .globl _memmove
36
37 .align 5
38 _bcopy: // void bcopy(const void *src, void *dst, size_t len)
39 ba _COMM_PAGE_BCOPY
40
41 .align 5
42 _memcpy: // void* memcpy(void *dst, const void *src, size_t len)
43 _memmove: // void* memmove(void *dst, const void *src, size_t len)
44 ba _COMM_PAGE_MEMCPY
45
46
47 #if 0
48 /* =======================================
49 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
50 * =======================================
51 *
52 * Version of 6/17/2002, for G3, G4, and G4+.
53 *
54 * There are many paths through this code, depending on length, reverse/forward,
55 * processor type, and alignment. We use reverse paths only when the operands
56 * overlap and the destination is higher than the source. They are not quite as
57 * fast as the forward paths.
58 *
59 * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
60 * the inner loops for long operands. DST is less effective than DCBT, because it
61 * can get out of sync with the inner loop. DCBTST is usually not a win, so we
62 * don't use it except during initialization when we're not using the LSU.
63 * We don't DCBT on G3, which only handles one load miss at a time.
64 *
65 * We don't use DCBZ, because it takes an alignment exception on uncached memory
66 * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
67 * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
68 *
69 * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
70 * since it avoids the read of destination cache lines. But for the hot-cache case
71 * it is always slower, because of the cycles spent needlessly zeroing data. Some
72 * machines store-gather and can cancel the read if all bytes of a line are stored,
73 * others cannot. Unless explicitly told which is better, we time loops with and
74 * without DCBA and use the fastest. Note that we never DCBA in reverse loops,
75 * since by definition they are overlapped so dest lines will be in the cache.
76 *
77 * For longer operands we use an 8-element branch table, based on the CPU type,
78 * to select the appropriate inner loop. The branch table is indexed as follows:
79 *
80 * bit 10000 set if a Reverse move is required
81 * bits 01100 set on the relative operand alignment: 0=unaligned, 1=word,
82 * 2=doubleword, and 3=quadword.
83 *
84 * By "relatively" n-byte aligned, we mean the source and destination are a multiple
85 * of n bytes apart (they need not be absolutely aligned.)
86 *
87 * The branch table for the running CPU type is pointed to by LBranchTablePtr.
88 * Initially, LBranchTablePtr points to G3's table, since that is the lowest
89 * common denominator that will run on any CPU. Later, pthread initialization
90 * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
91 * up the correct pointer for the running CPU.
92 *
93 * We distinguish between "short", "medium", and "long" operands:
94 * short (<= 32 bytes) most common case, minimum path length is important
95 * medium (> 32, < kLong) too short for Altivec or use of cache ops like DCBA
96 * long (>= kLong) long enough for cache ops and to amortize use of Altivec
97 *
98 * WARNING: kLong must be >=96, due to implicit assumptions about operand length.
99 */
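// For example, with forward operands that are 8 (but not 16) bytes apart the alignment
// code is 2, selecting entry 010 of the table, while a reverse move of unaligned operands
// selects entry 100.  Each entry is a 4-byte pointer, so the byte offset into a table is
// (reverse << 4) | (alignment << 2); LLong below forms exactly that offset.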
100 #define kLong 96
101
102 /* Register usage. Note we use R2, so this code will not run in a PEF/CFM
103 * environment. Note also the rather delicate way we assign multiple uses
104 * to the same register. Beware.
105 *
106 * r0 = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
107 * r2 = "w8" or VRSave ("rv")
108 * r3 = not used, as memcpy and memmove return 1st parameter as a value
109 * r4 = source ptr ("rs")
110 * r5 = count of bytes to move ("rc")
111 * r6 = "w1", "c16", or "cm17"
112 * r7 = "w2", "c32", or "cm33"
113 * r8 = "w3", "c48", or "cm49"
114 * r9 = "w4", "c64", or "cm1"
115 * r10 = "w5", "c96", or "cm97"
116 * r11 = "w6", "c128", "cm129", or return address ("ra")
117 * r12 = destination ptr ("rd")
118 * f0-f8 = used for moving 8-byte aligned data
119 * v0 = permute vector ("vp")
120 * v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
121 * v5-v7 = permuted qw's ("vx", "vy", and "vz")
122 */
123 #define rs r4
124 #define rd r12
125 #define rc r5
126 #define ra r11
127 #define rv r2
128
129 #define w1 r6
130 #define w2 r7
131 #define w3 r8
132 #define w4 r9
133 #define w5 r10
134 #define w6 r11
135 #define w7 r0
136 #define w8 r2
137
138 #define c16 r6
139 #define cm17 r6
140 #define c32 r7
141 #define cm33 r7
142 #define c48 r8
143 #define cm49 r8
144 #define c64 r9
145 #define cm1 r9
146 #define c96 r10
147 #define cm97 r10
148 #define c128 r11
149 #define cm129 r11
150
151 #define vp v0
152 #define vx v5
153 #define vy v6
154 #define vz v7
155
156 #define VRSave 256
157
158 #include <architecture/ppc/asm_help.h>
159
160 // The branch tables, 8 entries per CPU type.
161 // NB: we depend on 5 low-order 0s in the address of branch tables.
162
163 .data
164 .align 5 // must be 32-byte aligned
165
166 // G3 (the default CPU type)
167
168 LG3:
169 .long LForwardWord // 000: forward, unaligned
170 .long LForwardFloat // 001: forward, 4-byte aligned
171 .long LForwardFloat // 010: forward, 8-byte aligned
172 .long LForwardFloat // 011: forward, 16-byte aligned
173 .long LReverseWord // 100: reverse, unaligned
174 .long LReverseFloat // 101: reverse, 4-byte aligned
175 .long LReverseFloat // 110: reverse, 8-byte aligned
176 .long LReverseFloat // 111: reverse, 16-byte aligned
177
178 // G4s that benefit from DCBA.
179
180 LG4UseDcba:
181 .long LForwardVecUnal32Dcba // 000: forward, unaligned
182 .long LForwardVecUnal32Dcba // 001: forward, 4-byte aligned
183 .long LForwardVecUnal32Dcba // 010: forward, 8-byte aligned
184 .long LForwardVecAlig32Dcba // 011: forward, 16-byte aligned
185 .long LReverseVectorUnal32 // 100: reverse, unaligned
186 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
187 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
188 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
189
190 // G4s that should not use DCBA.
191
192 LG4NoDcba:
193 .long LForwardVecUnal32NoDcba // 000: forward, unaligned
194 .long LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
195 .long LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
196 .long LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
197 .long LReverseVectorUnal32 // 100: reverse, unaligned
198 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
199 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
200 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
201
202
203 // Pointer to the 8-element branch table for running CPU type:
204
205 LBranchTablePtr:
206 .long LG3 // default to G3 until "bcopy_initialize" called
207
208
209 // The CPU capability vector, initialized in pthread_init().
210 // "_bcopy_initialize" uses this to set up LBranchTablePtr:
211
212 .globl __cpu_capabilities
213 __cpu_capabilities:
214 .long 0
215
216 // Bit definitions for _cpu_capabilities:
217
218 #define kHasAltivec 0x01
219 #define k64Bit 0x02
220 #define kCache32 0x04
221 #define kCache64 0x08
222 #define kCache128 0x10
223 #define kUseDcba 0x20
224 #define kNoDcba 0x40
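// _bcopy_initialize (at the end of this file) classifies the CPU by masking combinations of
// these bits: kHasAltivec+kCache32 with neither DCBA bit is an "untyped" G4 whose DCBA use
// is decided by timing; adding kUseDcba or kNoDcba selects one of the two G4 tables;
// kCache32 alone selects the G3 table; anything unrecognized falls back to G3.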
225
226
227 .text
228 .globl _bcopy
229 .globl _memcpy
230 .globl _memmove
231 .globl __bcopy_initialize
232
233
234 // Main entry points.
235
236 .align 5
237 _bcopy: // void bcopy(const void *src, void *dst, size_t len)
238 mr r10,r3 // swap source and dest ptrs, to be like memcpy
239 mr r3,r4
240 mr r4,r10
241 _memcpy: // void* memcpy(void *dst, const void *src, size_t len)
242 _memmove: // void* memmove(void *dst, const void *src, size_t len)
243 cmplwi cr7,rc,32 // length <= 32 bytes?
244 sub. w1,r3,rs // must move in reverse if (rd-rs)<rc, set cr0 on src==dst
245 dcbt 0,rs // touch in the first line of source
246 cmplw cr6,w1,rc // set cr6 blt iff we must move reverse
247 cmplwi cr1,rc,kLong-1 // set cr1 bgt if long
248 mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
249 bgt- cr7,LMedium // longer than 32 bytes
250 dcbtst 0,rd // touch in destination
251 beq- cr7,LMove32 // special case moves of 32 bytes
252 blt- cr6,LShortReverse0
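// Note that the "(rd-rs)<rc" test above is unsigned: if rd is below rs the difference wraps
// to a huge value, so the single compare in cr6 is true exactly when the destination starts
// within the source region, ie when the operands overlap and the destination is at or above
// the source.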
253
254 // Forward short operands. This is the most frequent case, so it is inline.
255 // We also end up here to xfer the last 0-31 bytes of longer operands.
256
257 LShort: // WARNING: can fall into this routine
258 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
259 mtcrf 0x01,rc // move rest of length to cr7
260 beq 1f // quadword to move?
261 lwz w1,0(rs)
262 lwz w2,4(rs)
263 lwz w3,8(rs)
264 lwz w4,12(rs)
265 addi rs,rs,16
266 stw w1,0(rd)
267 stw w2,4(rd)
268 stw w3,8(rd)
269 stw w4,12(rd)
270 addi rd,rd,16
271 1:
272 LShort16: // join here to xfer 0-15 bytes
273 bf 28,2f // doubleword?
274 lwz w1,0(rs)
275 lwz w2,4(rs)
276 addi rs,rs,8
277 stw w1,0(rd)
278 stw w2,4(rd)
279 addi rd,rd,8
280 2:
281 bf 29,3f // word?
282 lwz w1,0(rs)
283 addi rs,rs,4
284 stw w1,0(rd)
285 addi rd,rd,4
286 3:
287 bf 30,4f // halfword to move?
288 lhz w1,0(rs)
289 addi rs,rs,2
290 sth w1,0(rd)
291 addi rd,rd,2
292 4:
293 bflr 31 // skip if no odd byte
294 lbz w1,0(rs)
295 stb w1,0(rd)
296 blr
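// In the short routines above and below, "mtcrf 0x01,rc" copies bits 28-31 of the length
// into cr7, so the "bf 28/29/30/31" tests select the 8-, 4-, 2-, and 1-byte residues; the
// 16-byte residue (bit 27) is tested separately with andi., which is sometimes faster than
// covering it with mtcrf.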
297
298
299 // Handle short reverse operands, fewer than 32 bytes in length.
300 // This is also used to transfer the last 0-31 bytes of longer operands.
301
302 LShortReverse0:
303 add rs,rs,rc // adjust ptrs for reverse move
304 add rd,rd,rc
305 LShortReverse:
306 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
307 mtcrf 0x01,rc // move rest of length to cr7
308 beq 1f // quadword to move?
309 lwz w1,-4(rs)
310 lwz w2,-8(rs)
311 lwz w3,-12(rs)
312 lwzu w4,-16(rs)
313 stw w1,-4(rd)
314 stw w2,-8(rd)
315 stw w3,-12(rd)
316 stwu w4,-16(rd)
317 1:
318 LShortReverse16: // join here to xfer 0-15 bytes and return
319 bf 28,2f // doubleword?
320 lwz w1,-4(rs)
321 lwzu w2,-8(rs)
322 stw w1,-4(rd)
323 stwu w2,-8(rd)
324 2:
325 bf 29,3f // word?
326 lwzu w1,-4(rs)
327 stwu w1,-4(rd)
328 3:
329 bf 30,4f // halfword to move?
330 lhzu w1,-2(rs)
331 sthu w1,-2(rd)
332 4:
333 bflr 31 // done if no odd byte
334 lbz w1,-1(rs) // no update
335 stb w1,-1(rd)
336 blr
337
338
339 // Special case for 32-byte moves. Too long for LShort, too common for LMedium.
340
341 LMove32:
342 lwz w1,0(rs)
343 lwz w2,4(rs)
344 lwz w3,8(rs)
345 lwz w4,12(rs)
346 lwz w5,16(rs)
347 lwz w6,20(rs)
348 lwz w7,24(rs)
349 lwz w8,28(rs)
350 stw w1,0(rd)
351 stw w2,4(rd)
352 stw w3,8(rd)
353 stw w4,12(rd)
354 stw w5,16(rd)
355 stw w6,20(rd)
356 stw w7,24(rd)
357 stw w8,28(rd)
358 LExit:
359 blr
360
361
362 // Medium length operands (32 < rc < kLong.) These loops run on all CPUs, as the
363 // operands are not long enough to bother with the branch table, using cache ops, or
364 // Altivec. We word align the source rather than the dest (which is what we align for
365 // long operands): doing so is faster on G4+ and probably beyond, we never DCBA on
366 // medium-length operands anyway, and the opportunity to cancel reads of dest cache lines is limited.
367 // w1 = (rd-rs), used to check for alignment
368 // cr0 = set on (rd-rs)
369 // cr1 = bgt if long operand
370 // cr6 = blt if reverse move
371
372 LMedium:
373 dcbtst 0,rd // touch in 1st line of destination
374 rlwinm r0,w1,0,29,31 // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
375 beq- LExit // early exit if (rs==rd), avoiding use of "beqlr"
376 neg w2,rs // we align source, not dest, and assume forward
377 cmpwi cr5,r0,0 // set cr5 beq if doubleword aligned
378 bgt- cr1,LLong // handle long operands
379 andi. w3,w2,3 // W3 <- #bytes to word-align source
380 blt- cr6,LMediumReverse // handle reverse move
381 lwz w1,0(rs) // pre-fetch first 4 bytes of source
382 beq- cr5,LMediumAligned // operands are doubleword aligned
383 sub rc,rc,w3 // adjust count for alignment
384 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShort16
385 srwi w4,rc,4 // w4 <- number of 16-byte chunks to xfer (>=1)
386 mtctr w4 // prepare loop count
387 beq+ 2f // source already aligned
388
389 lwzx w2,w3,rs // get 1st aligned word (which we might partially overwrite)
390 add rs,rs,w3 // word-align source ptr
391 stw w1,0(rd) // store all (w3) bytes at once to avoid a loop
392 add rd,rd,w3
393 mr w1,w2 // first aligned word to w1
394 b 2f
395
396 .align 4 // align inner loops
397 1: // loop over 16-byte chunks
398 lwz w1,0(rs)
399 2:
400 lwz w2,4(rs)
401 lwz w3,8(rs)
402 lwz w4,12(rs)
403 addi rs,rs,16
404 stw w1,0(rd)
405 stw w2,4(rd)
406 stw w3,8(rd)
407 stw w4,12(rd)
408 addi rd,rd,16
409 bdnz 1b
410
411 b LShort16
412
413
414 // Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
415 // and reduced throughput for floating pt loads and stores; future processors will probably
416 // have even worse lfd/stfd performance. We use it here because it is so important for G3,
417 // and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
418 // G3-only long operand loops use floating pt even for word-aligned operands.
419 // w2 = neg(rs)
420 // w1 = first 4 bytes of source
421
422 LMediumAligned:
423 andi. w3,w2,7 // already aligned?
424 sub rc,rc,w3 // adjust count by 0-7 bytes
425 lfdx f0,rs,w3 // pre-fetch first aligned source doubleword
426 srwi w4,rc,5 // get count of 32-byte chunks (might be 0 if unaligned)
427 mtctr w4
428 beq- LForwardFloatLoop1 // already aligned
429
430 cmpwi w4,0 // are there any 32-byte chunks to xfer?
431 lwz w2,4(rs) // get 2nd (unaligned) source word
432 add rs,rs,w3 // doubleword align source pointer
433 stw w1,0(rd) // store first 8 bytes of source to align...
434 stw w2,4(rd) // ...which could overwrite source
435 add rd,rd,w3 // doubleword align destination
436 bne+ LForwardFloatLoop1 // at least 1 chunk, so enter loop
437
438 subi rc,rc,8 // unfortunate degenerate case: no chunks to xfer
439 stfd f0,0(rd) // must store f0 since source might have been overwritten
440 addi rs,rs,8
441 addi rd,rd,8
442 b LShort
443
444
445 // Medium reverse moves. This loop runs on all processors.
446
447 LMediumReverse:
448 add rs,rs,rc // point to other end of operands when in reverse
449 add rd,rd,rc
450 andi. w3,rs,3 // w3 <- #bytes to word align source
451 lwz w1,-4(rs) // pre-fetch 1st 4 bytes of source
452 sub rc,rc,w3 // adjust count
453 srwi w4,rc,4 // get count of 16-byte chunks (>=1)
454 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShortReverse16
455 mtctr w4 // prepare loop count
456 beq+ 2f // source already aligned
457
458 sub rs,rs,w3 // word-align source ptr
459 lwz w2,-4(rs) // get 1st aligned word which we may overwrite
460 stw w1,-4(rd) // store all 4 bytes to align without a loop
461 sub rd,rd,w3
462 mr w1,w2 // shift 1st aligned source word to w1
463 b 2f
464
465 1:
466 lwz w1,-4(rs)
467 2:
468 lwz w2,-8(rs)
469 lwz w3,-12(rs)
470 lwzu w4,-16(rs)
471 stw w1,-4(rd)
472 stw w2,-8(rd)
473 stw w3,-12(rd)
474 stwu w4,-16(rd)
475 bdnz 1b
476
477 b LShortReverse16
478
479
480 // Long operands. Use branch table to decide which loop to use.
481 // w1 = (rd-rs), used to determine alignment
482
483 LLong:
484 xor w4,w1,rc // we must move reverse if (rd-rs)<rc
485 mflr ra // save return address
486 rlwinm w5,w1,1,27,30 // w5 <- ((w1 & 0xF) << 1)
487 bcl 20,31,1f // use reserved form to get our location
488 1:
489 mflr w3 // w3 == addr(1b)
490 lis w8,0x0408 // load a 16 element, 2-bit array into w8...
491 cntlzw w4,w4 // find first difference between (rd-rs) and rc
492 addis w2,w3,ha16(LBranchTablePtr-1b)
493 ori w8,w8,0x040C // ...used to map w5 to alignment encoding (ie, to 0-3)
494 lwz w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
495 slw w4,rc,w4 // bit 0 of w4 set iff (rd-rs)<rc
496 rlwnm w5,w8,w5,28,29 // put alignment encoding in bits 01100 of w5
497 rlwimi w2,w4,5,27,27 // put reverse bit in bit 10000 of branch table address
498 lwzx w3,w2,w5 // w3 <- load loop address from branch table
499 neg w1,rd // start to compute destination alignment
500 mtctr w3
501 andi. r0,w1,0x1F // r0 <- bytes req'd to 32-byte align dest (if forward move)
502 bctr // NB: r0/cr0 and w1 are passed as parameters
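// In C terms, the dispatch above does roughly this (sketch; "encode" is the packed 2-bit
// table loaded into w8 as 0x0408040C, indexed by the low nibble of the operand difference):
//      unsigned diff    = rd - rs;                      // w1, from the entry point
//      unsigned reverse = (diff < rc);                  // computed via cntlzw/slw on (diff ^ rc)
//      unsigned align   = encode[diff & 0xF];           // 0=unaligned, 1=word, 2=doubleword, 3=quadword
//      void *loop = *(void **)((char *)table + (reverse << 4) + (align << 2));
// The reverse bit can be inserted directly into the table address because the tables are
// 32-byte aligned (the "5 low-order 0s" noted above their definitions).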
503
504
505 // G3, forward, long, unaligned.
506 // w1 = neg(rd)
507
508 LForwardWord:
509 andi. w3,w1,3 // W3 <- #bytes to word-align destination
510 mtlr ra // restore return address
511 sub rc,rc,w3 // adjust count for alignment
512 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
513 mtctr r0 // prepare loop count
514 beq+ 1f // dest already aligned
515
516 lwz w2,0(rs) // get first 4 bytes of source
517 lwzx w1,w3,rs // get source bytes we might overwrite
518 add rs,rs,w3 // adjust source ptr
519 stw w2,0(rd) // store all 4 bytes to avoid a loop
520 add rd,rd,w3 // word-align destination
521 b 2f
522 1:
523 lwz w1,0(rs)
524 2:
525 lwz w2,4(rs)
526 lwz w3,8(rs)
527 lwz w4,12(rs)
528 lwz w5,16(rs)
529 lwz w6,20(rs)
530 lwz w7,24(rs)
531 lwz w8,28(rs)
532 addi rs,rs,32
533 stw w1,0(rd)
534 stw w2,4(rd)
535 stw w3,8(rd)
536 stw w4,12(rd)
537 stw w5,16(rd)
538 stw w6,20(rd)
539 stw w7,24(rd)
540 stw w8,28(rd)
541 addi rd,rd,32
542 bdnz 1b
543
544 b LShort
545
546
547 // G3, forward, long, word aligned. We use floating pt even when only word aligned.
548 // w1 = neg(rd)
549
550 LForwardFloat:
551 andi. w3,w1,7 // W3 <- #bytes to doubleword-align destination
552 mtlr ra // restore return address
553 sub rc,rc,w3 // adjust count for alignment
554 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
555 mtctr r0 // prepare loop count
556 beq LForwardFloatLoop // dest already aligned
557
558 lwz w1,0(rs) // get first 8 bytes of source
559 lwz w2,4(rs)
560 lfdx f0,w3,rs // get source bytes we might overwrite
561 add rs,rs,w3 // word-align source ptr
562 stw w1,0(rd) // store all 8 bytes to avoid a loop
563 stw w2,4(rd)
564 add rd,rd,w3
565 b LForwardFloatLoop1
566
567 .align 4 // align since this loop is executed by G4s too
568 LForwardFloatLoop:
569 lfd f0,0(rs)
570 LForwardFloatLoop1: // enter here from LMediumAligned and above
571 lfd f1,8(rs)
572 lfd f2,16(rs)
573 lfd f3,24(rs)
574 addi rs,rs,32
575 stfd f0,0(rd)
576 stfd f1,8(rd)
577 stfd f2,16(rd)
578 stfd f3,24(rd)
579 addi rd,rd,32
580 bdnz LForwardFloatLoop
581
582 b LShort
583
584
585 // G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
586 // r0/cr0 = #bytes to 32-byte align
587
588 LForwardVecAlig32Dcba:
589 bnel+ LAlign32 // align destination iff necessary
590 bl LPrepareForwardVectors
591 mtlr ra // restore return address before loading c128
592 li c128,128
593 b 1f // enter aligned loop
594
595 .align 5 // long loop heads should be at least 16-byte aligned
596 1: // loop over aligned 64-byte chunks
597 dcbt c96,rs // pre-fetch three cache lines ahead
598 dcbt c128,rs // and four
599 lvx v1,0,rs
600 lvx v2,c16,rs
601 lvx v3,c32,rs
602 lvx v4,c48,rs
603 addi rs,rs,64
604 dcba 0,rd // avoid read of destination cache lines
605 stvx v1,0,rd
606 stvx v2,c16,rd
607 dcba c32,rd
608 stvx v3,c32,rd
609 stvx v4,c48,rd
610 addi rd,rd,64
611 bdnz 1b
612
613 LForwardVectorAlignedEnd: // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
614 beq- 3f // no leftover quadwords
615 mtctr r0
616 2: // loop over remaining quadwords (1-3)
617 lvx v1,0,rs
618 addi rs,rs,16
619 stvx v1,0,rd
620 addi rd,rd,16
621 bdnz 2b
622 3:
623 mtspr VRSave,rv // restore bitmap of live vr's
624 bne cr6,LShort16 // handle last 0-15 bytes if any
625 blr
626
627
628 // G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
629 // r0/cr0 = #bytes to 32-byte align
630
631 LForwardVecAlig32NoDcba:
632 bnel+ LAlign32 // align destination iff necessary
633 bl LPrepareForwardVectors
634 mtlr ra // restore return address before loading c128
635 li c128,128
636 b 1f // enter aligned loop
637
638 .align 4 // balance 13-word loop between QWs...
639 nop // ...which improves performance 5% +/-
640 nop
641 1: // loop over aligned 64-byte chunks
642 dcbt c96,rs // pre-fetch three cache lines ahead
643 dcbt c128,rs // and four
644 lvx v1,0,rs
645 lvx v2,c16,rs
646 lvx v3,c32,rs
647 lvx v4,c48,rs
648 addi rs,rs,64
649 stvx v1,0,rd
650 stvx v2,c16,rd
651 stvx v3,c32,rd
652 stvx v4,c48,rd
653 addi rd,rd,64
654 bdnz 1b
655
656 b LForwardVectorAlignedEnd
657
658
659 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
660 // some CPUs, this routine is no slower than the simpler aligned version that does
661 // not use permutes. But it cannot be used with aligned operands, because of the
662 // way it prefetches source QWs.
663 // r0/cr0 = #bytes to 32-byte align
664
665 LForwardVecUnal32Dcba:
666 bnel+ LAlign32 // align destination iff necessary
667 bl LPrepareForwardVectors
668 lvx v1,0,rs // prime loop
669 mtlr ra // restore return address before loading c128
670 lvsl vp,0,rs // get permute vector to shift left
671 li c128,128
672 b 1f // enter aligned loop
673
674 .align 4 // long loop heads should be at least 16-byte aligned
675 1: // loop over aligned 64-byte destination chunks
676 lvx v2,c16,rs
677 dcbt c96,rs // touch 3rd cache line ahead
678 lvx v3,c32,rs
679 dcbt c128,rs // touch 4th cache line ahead
680 lvx v4,c48,rs
681 addi rs,rs,64
682 vperm vx,v1,v2,vp
683 lvx v1,0,rs
684 vperm vy,v2,v3,vp
685 dcba 0,rd // avoid read of destination lines
686 stvx vx,0,rd
687 vperm vz,v3,v4,vp
688 stvx vy,c16,rd
689 dcba c32,rd
690 vperm vx,v4,v1,vp
691 stvx vz,c32,rd
692 stvx vx,c48,rd
693 addi rd,rd,64
694 bdnz 1b
695
696 LForwardVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
697 beq- 3f // no leftover quadwords
698 mtctr r0
699 2: // loop over remaining quadwords
700 lvx v2,c16,rs
701 addi rs,rs,16
702 vperm vx,v1,v2,vp
703 vor v1,v2,v2 // v1 <- v2
704 stvx vx,0,rd
705 addi rd,rd,16
706 bdnz 2b
707 3:
708 mtspr VRSave,rv // restore bitmap of live vr's
709 bne cr6,LShort16 // handle last 0-15 bytes if any
710 blr
711
712
713 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
714 // r0/cr0 = #bytes to 32-byte align
715
716 LForwardVecUnal32NoDcba:
717 bnel+ LAlign32 // align destination iff necessary
718 bl LPrepareForwardVectors
719 lvx v1,0,rs // prime loop
720 mtlr ra // restore return address before loading c128
721 lvsl vp,0,rs // get permute vector to shift left
722 li c128,128
723 b 1f // enter aligned loop
724
725 .align 4
726 nop // balance 17-word loop between QWs
727 nop
728 1: // loop over aligned 64-byte destination chunks
729 lvx v2,c16,rs
730 dcbt c96,rs // touch 3rd cache line ahead
731 lvx v3,c32,rs
732 dcbt c128,rs // touch 4th cache line ahead
733 lvx v4,c48,rs
734 addi rs,rs,64
735 vperm vx,v1,v2,vp
736 lvx v1,0,rs
737 vperm vy,v2,v3,vp
738 stvx vx,0,rd
739 vperm vz,v3,v4,vp
740 stvx vy,c16,rd
741 vperm vx,v4,v1,vp
742 stvx vz,c32,rd
743 stvx vx,c48,rd
744 addi rd,rd,64
745 bdnz 1b
746
747 b LForwardVectorUnalignedEnd
748
749
750 // G3 Reverse, long, unaligned.
751
752 LReverseWord:
753 bl LAlign8Reverse // 8-byte align destination
754 mtlr ra // restore return address
755 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
756 mtctr r0
757 1:
758 lwz w1,-4(rs)
759 lwz w2,-8(rs)
760 lwz w3,-12(rs)
761 lwz w4,-16(rs)
762 stw w1,-4(rd)
763 lwz w5,-20(rs)
764 stw w2,-8(rd)
765 lwz w6,-24(rs)
766 stw w3,-12(rd)
767 lwz w7,-28(rs)
768 stw w4,-16(rd)
769 lwzu w8,-32(rs)
770 stw w5,-20(rd)
771 stw w6,-24(rd)
772 stw w7,-28(rd)
773 stwu w8,-32(rd)
774 bdnz 1b
775
776 b LShortReverse
777
778
779 // G3 Reverse, long, word aligned.
780
781 LReverseFloat:
782 bl LAlign8Reverse // 8-byte align
783 mtlr ra // restore return address
784 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
785 mtctr r0
786 1:
787 lfd f0,-8(rs)
788 lfd f1,-16(rs)
789 lfd f2,-24(rs)
790 lfdu f3,-32(rs)
791 stfd f0,-8(rd)
792 stfd f1,-16(rd)
793 stfd f2,-24(rd)
794 stfdu f3,-32(rd)
795 bdnz 1b
796
797 b LShortReverse
798
799
800 // G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.
801
802 LReverseVectorAligned32:
803 bl LAlign32Reverse // 32-byte align destination iff necessary
804 bl LPrepareReverseVectors
805 mtlr ra // restore return address before loading cm129
806 li cm129,-129
807 b 1f // enter aligned loop
808
809 .align 4
810 nop // must start in 3rd word of QW...
811 nop // ...to keep balanced
812 1: // loop over aligned 64-byte chunks
813 dcbt cm97,rs // pre-fetch three cache lines ahead
814 dcbt cm129,rs // and four
815 lvx v1,cm1,rs
816 lvx v2,cm17,rs
817 lvx v3,cm33,rs
818 lvx v4,cm49,rs
819 subi rs,rs,64
820 stvx v1,cm1,rd
821 stvx v2,cm17,rd
822 stvx v3,cm33,rd
823 stvx v4,cm49,rd
824 subi rd,rd,64
825 bdnz 1b
826
827 LReverseVectorAlignedEnd: // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
828 beq 3f // no leftover quadwords
829 mtctr r0
830 2: // loop over 1-3 quadwords
831 lvx v1,cm1,rs
832 subi rs,rs,16
833 stvx v1,cm1,rd
834 subi rd,rd,16
835 bdnz 2b
836 3:
837 mtspr VRSave,rv // restore bitmap of live vr's
838 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
839 blr
840
841
842 // G4 Reverse, long, unaligned, 32-byte DCBT.
843
844 LReverseVectorUnal32:
845 bl LAlign32Reverse // align destination iff necessary
846 bl LPrepareReverseVectors
847 lvx v1,cm1,rs // prime loop
848 mtlr ra // restore return address before loading cm129
849 lvsl vp,0,rs // get permute vector to shift left
850 li cm129,-129
851 b 1f // enter aligned loop
852
853 .align 4
854 nop // start loop in 3rd word on QW to balance
855 nop
856 1: // loop over aligned 64-byte destination chunks
857 lvx v2,cm17,rs
858 dcbt cm97,rs // touch in 3rd source block
859 lvx v3,cm33,rs
860 dcbt cm129,rs // touch in 4th
861 lvx v4,cm49,rs
862 subi rs,rs,64
863 vperm vx,v2,v1,vp
864 lvx v1,cm1,rs
865 vperm vy,v3,v2,vp
866 stvx vx,cm1,rd
867 vperm vz,v4,v3,vp
868 stvx vy,cm17,rd
869 vperm vx,v1,v4,vp
870 stvx vz,cm33,rd
871 stvx vx,cm49,rd
872 subi rd,rd,64
873 bdnz 1b
874
875 LReverseVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
876 beq 3f // no leftover quadwords
877 mtctr r0
878 2: // loop over 1-3 quadwords
879 lvx v2,cm17,rs
880 subi rs,rs,16
881 vperm vx,v2,v1,vp
882 vor v1,v2,v2 // v1 <- v2
883 stvx vx,cm1,rd
884 subi rd,rd,16
885 bdnz 2b
886 3:
887 mtspr VRSave,rv // restore bitmap of live vr's
888 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
889 blr
890
891
892 // Subroutine to prepare for 64-byte forward vector loops.
893 // Returns many things:
894 // ctr = number of 64-byte chunks to move
895 // r0/cr0 = leftover QWs to move
896 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
897 // cr6 = beq if leftover byte count is 0
898 // c16..c96 loaded
899 // rv = original value of VRSave
900 // NB: c128 is not set (even if needed), since its register (r11) still holds "ra"
901
902 LPrepareForwardVectors:
903 mfspr rv,VRSave // get bitmap of live vector registers
904 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
905 oris w1,rv,0xFF00 // we use v0-v7
906 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShort16
907 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
908 mtspr VRSave,w1 // update mask
909 li c16,16 // get constants used in lvx/stvx
910 li c32,32
911 mtctr r0 // set up loop count
912 cmpwi cr6,w3,0 // set cr6 on leftover byte count
913 li c48,48
914 li c96,96
915 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
916 blr
917
918
919 // Subroutine to prepare for 64-byte reverse vector loops.
920 // Returns many things:
921 // ctr = number of 64-byte chunks to move
922 // r0/cr0 = leftover QWs to move
923 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
924 // cr6 = beq if leftover byte count is 0
925 // cm1..cm97 loaded
926 // rv = original value of VRSave
927 // NB: cm129 is not set (even if needed), since its register (r11) still holds "ra"
928
929 LPrepareReverseVectors:
930 mfspr rv,VRSave // get bitmap of live vector registers
931 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
932 oris w1,rv,0xFF00 // we use v0-v7
933 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
934 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
935 mtspr VRSave,w1 // update mask
936 li cm1,-1 // get constants used in lvx/stvx
937 li cm17,-17
938 mtctr r0 // set up loop count
939 cmpwi cr6,w3,0 // set cr6 on leftover byte count
940 li cm33,-33
941 li cm49,-49
942 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
943 li cm97,-97
944 blr
945
946
947 // Subroutine to align destination on a 32-byte boundary.
948 // r0 = number of bytes to xfer (0-31)
949
950 LAlign32:
951 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
952 mtcrf 0x02,r0
953 sub rc,rc,r0 // adjust length
954 bf 31,1f // skip if no odd bit
955 lbz w1,0(rs)
956 addi rs,rs,1
957 stb w1,0(rd)
958 addi rd,rd,1
959 1:
960 bf 30,2f // halfword to move?
961 lhz w1,0(rs)
962 addi rs,rs,2
963 sth w1,0(rd)
964 addi rd,rd,2
965 2:
966 bf 29,3f // word?
967 lwz w1,0(rs)
968 addi rs,rs,4
969 stw w1,0(rd)
970 addi rd,rd,4
971 3:
972 bf 28,4f // doubleword?
973 lwz w1,0(rs)
974 lwz w2,4(rs)
975 addi rs,rs,8
976 stw w1,0(rd)
977 stw w2,4(rd)
978 addi rd,rd,8
979 4:
980 bflr 27 // done if no quadword to move
981 lwz w1,0(rs)
982 lwz w2,4(rs)
983 lwz w3,8(rs)
984 lwz w4,12(rs)
985 addi rs,rs,16
986 stw w1,0(rd)
987 stw w2,4(rd)
988 stw w3,8(rd)
989 stw w4,12(rd)
990 addi rd,rd,16
991 blr
992
993 // Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
994 // rs and rd still point to low end of operands
995 // we adjust rs and rd to point to last byte moved
996
997 LAlign32Reverse:
998 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
999 add rs,rs,rc
1000 andi. r0,rd,0x1F // r0 <- #bytes that must be moved to align destination
1001 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
1002 mtcrf 0x02,r0
1003 sub rc,rc,r0 // update length
1004 beqlr- // destination already 32-byte aligned
1005
1006 bf 31,1f // odd byte?
1007 lbzu w1,-1(rs)
1008 stbu w1,-1(rd)
1009 1:
1010 bf 30,2f // halfword to move?
1011 lhzu w1,-2(rs)
1012 sthu w1,-2(rd)
1013 2:
1014 bf 29,3f // word?
1015 lwzu w1,-4(rs)
1016 stwu w1,-4(rd)
1017 3:
1018 bf 28,4f // doubleword?
1019 lwz w1,-4(rs)
1020 lwzu w2,-8(rs)
1021 stw w1,-4(rd)
1022 stwu w2,-8(rd)
1023 4:
1024 bflr 27 // done if no quadwords
1025 lwz w1,-4(rs)
1026 lwz w2,-8(rs)
1027 lwz w3,-12(rs)
1028 lwzu w4,-16(rs)
1029 stw w1,-4(rd)
1030 stw w2,-8(rd)
1031 stw w3,-12(rd)
1032 stwu w4,-16(rd)
1033 blr
1034
1035
1036 // Subroutine to align destination on an 8-byte boundary for reverse moves.
1037 // rs and rd still point to low end of operands
1038 // we adjust rs and rd to point to last byte moved
1039
1040 LAlign8Reverse:
1041 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
1042 add rs,rs,rc
1043 andi. r0,rd,0x7 // r0 <- #bytes that must be moved to align destination
1044 beqlr- // destination already 8-byte aligned
1045 mtctr r0 // set up for loop
1046 sub rc,rc,r0 // update length
1047 1:
1048 lbzu w1,-1(rs)
1049 stbu w1,-1(rd)
1050 bdnz 1b
1051
1052 blr
1053
1054
1055 // Called by pthread initialization to set up the branch table pointer based on
1056 // the CPU capability vector. This routine may be called more than once (for
1057 // example, during testing.)
1058
1059 // Size of the buffer we use to do DCBA timing on G4:
1060 #define kBufSiz 1024
1061
1062 // Stack frame size, which contains the 128-byte-aligned buffer:
1063 #define kSFSize (kBufSiz+128+16)
1064
1065 // Iterations of the timing loop:
1066 #define kLoopCnt 5
1067
1068 // Bit in cr5 used as a flag in timing loop:
1069 #define kDCBA 22
1070
1071 __bcopy_initialize: // int _bcopy_initialize(void)
1072 mflr ra // get return
1073 stw ra,8(r1) // save
1074 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
1075 addi w6,r1,127+16 // get base address...
1076 rlwinm w6,w6,0,0,24 // ...of our buffer, 128-byte aligned
1077 bcl 20,31,1f // get our PIC base
1078 1:
1079 mflr w1
1080 addis w2,w1,ha16(__cpu_capabilities - 1b)
1081 lwz w3,lo16(__cpu_capabilities - 1b)(w2)
1082 andi. r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
1083 cmpwi r0,kCache32+kHasAltivec // untyped G4?
1084 li w8,0 // assume no need to test
1085 bne 2f // not an untyped G4, so do not test
1086
1087 // G4, but neither kUseDcba nor kNoDcba is set. Time and select the fastest.
1088
1089 crset kDCBA // first, use DCBA
1090 bl LTest32 // time it
1091 mr w8,w4 // w8 <- best time using DCBA
1092 srwi r0,w8,3 // bias 12 pct in favor of not using DCBA...
1093 add w8,w8,r0 // ...because DCBA is always slower with warm cache
1094 crclr kDCBA
1095 bl LTest32 // w4 <- best time without DCBA
1096 cmplw w8,w4 // which is better?
1097 li w8,kUseDcba // assume using DCBA is faster
1098 blt 2f
1099 li w8,kNoDcba // no DCBA is faster
1100
1101 // What branch table to use?
1102
1103 2: // here with w8 = 0, kUseDcba, or kNoDcba
1104 bcl 20,31,4f // get our PIC base again
1105 4:
1106 mflr w1
1107 addis w2,w1,ha16(__cpu_capabilities - 4b)
1108 lwz w3,lo16(__cpu_capabilities - 4b)(w2)
1109 or w3,w3,w8 // add in kUseDcba or kNoDcba if untyped G4
1110 mr r3,w8 // return dynamic selection, if any (used in testing)
1111
1112 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1113 cmpwi r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
1114 addis w4,w1,ha16(LG4UseDcba - 4b)
1115 addi w4,w4,lo16(LG4UseDcba - 4b)
1116 beq 5f
1117
1118 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1119 cmpwi r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
1120 addis w4,w1,ha16(LG4NoDcba - 4b)
1121 addi w4,w4,lo16(LG4NoDcba - 4b)
1122 beq 5f
1123
1124 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
1125 cmpwi r0,kCache32 // G3?
1126 addis w4,w1,ha16(LG3 - 4b)
1127 addi w4,w4,lo16(LG3 - 4b)
1128 beq 5f
1129
1130 // Map unrecognized CPU types to G3 (lowest common denominator)
1131
1132 5: // w4 <- branch table pointer
1133 addis w5,w1,ha16(LBranchTablePtr - 4b)
1134 stw w4,lo16(LBranchTablePtr - 4b)(w5)
1135 lwz ra,kSFSize+8(r1) // recover return address
1136 mtlr ra // restore it
1137 lwz r1,0(r1) // pop off our stack frame
1138 blr // return dynamic selection (or 0) in r3
1139
1140
1141 // Subroutine to time a 32-byte cache.
1142 // kDCBA = set if we should use DCBA
1143 // w6 = base of buffer to use for test (kBufSiz bytes)
1144 // w4 = we return time of fastest loop in w4
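// The routine makes kLoopCnt passes over the buffer.  Each pass first flushes the buffer
// from the cache with DCBF and a SYNC, then stores every 32-byte block (preceded by a DCBA
// iff kDCBA is set) while reading the timebase before and after.  A pass is retried if the
// upper timebase word carried, and only the fastest pass is kept.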
1145
1146 LTest32:
1147 li w1,kLoopCnt // number of times to loop
1148 li w4,-1 // initialize fastest time
1149 1:
1150 mr rd,w6 // initialize buffer ptr
1151 li r0,kBufSiz/32 // r0 <- cache blocks to test
1152 mtctr r0
1153 2:
1154 dcbf 0,rd // first, force the blocks out of the cache
1155 addi rd,rd,32
1156 bdnz 2b
1157 sync // make sure all the flushes take
1158 mr rd,w6 // re-initialize buffer ptr
1159 mtctr r0 // reset cache-block count
1160 mftbu w5 // remember upper half so we can check for carry
1161 mftb w2 // start the timer
1162 3: // loop over cache blocks
1163 bf kDCBA,4f // should we DCBA?
1164 dcba 0,rd
1165 4:
1166 stfd f1,0(rd) // store the entire cache block
1167 stfd f1,8(rd)
1168 stfd f1,16(rd)
1169 stfd f1,24(rd)
1170 addi rd,rd,32
1171 bdnz 3b
1172 mftb w3
1173 mftbu r0
1174 cmpw r0,w5 // did timebase carry?
1175 bne 1b // yes, retest rather than fuss
1176 sub w3,w3,w2 // w3 <- time for this loop
1177 cmplw w3,w4 // faster than current best?
1178 bge 5f // no
1179 mr w4,w3 // remember fastest time through loop
1180 5:
1181 subi w1,w1,1 // decrement outer loop count
1182 cmpwi w1,0 // more to go?
1183 bne 1b // loop if so
1184 blr
1185
1186 #endif /* 0 */