1 /*
2 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* =======================================
23 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
24 * =======================================
25 *
26 * Version of 6/17/2002, for G3, G4, and G4+.
27 *
28 * There are many paths through this code, depending on length, reverse/forward,
29 * processor type, and alignment. We use reverse paths only when the operands
30 * overlap and the destination is higher than the source. They are not quite as
31 * fast as the forward paths.
32 *
33 * Judicious use of DCBTs, just far enough ahead to minimize waiting, is critical in
34 * the inner loops for long operands. DST is less effective than DCBT, because it
35 * can get out of sync with the inner loop. DCBTST is usually not a win, so we
36 * don't use it except during initialization when we're not using the LSU.
37 * We don't DCBT on G3, which only handles one load miss at a time.
38 *
39 * We don't use DCBZ, because it takes an alignment exception on uncached memory
40 * like frame buffers. Bcopy to frame buffers must work. This hurts G3 in the
41 * cold-cache case, but G4 can use DCBA (which does not take alignment exceptions.)
42 *
43 * Using DCBA on G4 is a tradeoff. For the cold-cache case it can be a big win,
44 * since it avoids the read of destination cache lines. But for the hot-cache case
45 * it is always slower, because of the cycles spent needlessly zeroing data. Some
46 * machines store-gather and can cancel the read if all bytes of a line are stored,
47 * others cannot. Unless explicitly told which is better, we time loops with and
48 * without DCBA and use the fastest. Note that we never DCBA in reverse loops,
49 * since by definition they are overlapped so dest lines will be in the cache.
50 *
51 * For longer operands we use an 8-element branch table, based on the CPU type,
52 * to select the appropriate inner loop.  The branch table is indexed as follows
53 * (a C sketch of this selection appears just after this comment):
54 *    bit  10000 set if a Reverse move is required
55 *    bits 01100 encode the relative operand alignment: 0=unaligned, 1=word,
56 *               2=doubleword, and 3=quadword.
57 *
58 * By "relatively" n-byte aligned, we mean the source and destination are a multiple
59 * of n bytes apart (they need not be absolutely aligned.)
60 *
61 * The branch table for the running CPU type is pointed to by LBranchTablePtr.
62 * Initially, LBranchTablePtr points to G3's table, since that is the lowest
63 * common denominator that will run on any CPU. Later, pthread initialization
64 * sets up the _cpu_capabilities vector and calls _bcopy_initialize, which sets
65 * up the correct pointer for the running CPU.
66 *
67 * We distinguish between "short", "medium", and "long" operands:
68 * short (<= 32 bytes) most common case, minimum path length is important
69 * medium (> 32, < kLong) too short for Altivec or use of cache ops like DCBA
70 * long (>= kLong) long enough for cache ops and to amortize use of Altivec
71 *
72 * WARNING: kLong must be >=96: the long-operand paths assume at least one 64-byte
73 * chunk remains after 32-byte aligning the destination (up to 31 bytes). */
74 #define kLong 96
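
// A minimal C sketch of the branch-table index selection described above: entries 0-3
// are the forward loops and 4-7 the reverse loops. Names here are hypothetical and the
// code is illustrative only; the real computation is done branchlessly in LLong below
// with cntlzw/slw/rlwnm/rlwimi.
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	static int branch_index(const void *src, void *dst, size_t len) {
//	    uint32_t diff  = (uint32_t)((uintptr_t)dst - (uintptr_t)src);
//	    int      rev   = diff < (uint32_t)len;    // overlap with dst above src: go reverse
//	    uint32_t rel   = diff & 0xF;               // relative (not absolute) alignment
//	    int      align = (rel == 0)       ? 3      // 16-byte (quadword) apart
//	                   : ((rel & 7) == 0) ? 2      // 8-byte (doubleword)
//	                   : ((rel & 3) == 0) ? 1      // 4-byte (word)
//	                   :                    0;     // unaligned
//	    return (rev << 2) | align;                 // index 0-7 into the tables below
//	}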
75
76 /* Register usage. Note we use R2, so this code will not run in a PEF/CFM
77 * environment. Note also the rather delicate way we assign multiple uses
78 * to the same register. Beware.
79 *
80 * r0 = "w7" or "r0" (NB: cannot use r0 for any constant such as "c16")
81 * r2 = "w8" or VRSave ("rv")
82 * r3 = not used, as memcpy and memmove return 1st parameter as a value
83 * r4 = source ptr ("rs")
84 * r5 = count of bytes to move ("rc")
85 * r6 = "w1", "c16", or "cm17"
86 * r7 = "w2", "c32", or "cm33"
87 * r8 = "w3", "c48", or "cm49"
88 * r9 = "w4", "c64", or "cm1"
89 * r10 = "w5", "c96", or "cm97"
90 * r11 = "w6", "c128", "cm129", or return address ("ra")
91 * r12 = destination ptr ("rd")
92 * f0-f8 = used for moving 8-byte aligned data
93 * v0 = permute vector ("vp")
94 * v1-v4 = qw's loaded from source ("v1", "v2", "v3", and "v4")
95 * v5-v7 = permuted qw's ("vx", "vy", and "vz")
96 */
97 #define rs r4
98 #define rd r12
99 #define rc r5
100 #define ra r11
101 #define rv r2
102
103 #define w1 r6
104 #define w2 r7
105 #define w3 r8
106 #define w4 r9
107 #define w5 r10
108 #define w6 r11
109 #define w7 r0
110 #define w8 r2
111
112 #define c16 r6
113 #define cm17 r6
114 #define c32 r7
115 #define cm33 r7
116 #define c48 r8
117 #define cm49 r8
118 #define c64 r9
119 #define cm1 r9
120 #define c96 r10
121 #define cm97 r10
122 #define c128 r11
123 #define cm129 r11
124
125 #define vp v0
126 #define vx v5
127 #define vy v6
128 #define vz v7
129
130 #define VRSave 256
131
132 #include <architecture/ppc/asm_help.h>
133
134 // The branch tables, 8 entries per CPU type.
135 // NB: we depend on 5 low-order 0s in each table's address, since LLong inserts the reverse bit directly into it.
136
137 .data
138 .align 5 // must be 32-byte aligned
139
140 // G3 (the default CPU type)
141
142 LG3:
143 .long LForwardWord // 000: forward, unaligned
144 .long LForwardFloat // 001: forward, 4-byte aligned
145 .long LForwardFloat // 010: forward, 8-byte aligned
146 .long LForwardFloat // 011: forward, 16-byte aligned
147 .long LReverseWord // 100: reverse, unaligned
148 .long LReverseFloat // 101: reverse, 4-byte aligned
149 .long LReverseFloat // 110: reverse, 8-byte aligned
150 .long LReverseFloat // 111: reverse, 16-byte aligned
151
152 // G4s that benefit from DCBA.
153
154 LG4UseDcba:
155 .long LForwardVecUnal32Dcba // 000: forward, unaligned
156 .long LForwardVecUnal32Dcba // 001: forward, 4-byte aligned
157 .long LForwardVecUnal32Dcba // 010: forward, 8-byte aligned
158 .long LForwardVecAlig32Dcba // 011: forward, 16-byte aligned
159 .long LReverseVectorUnal32 // 100: reverse, unaligned
160 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
161 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
162 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
163
164 // G4s that should not use DCBA.
165
166 LG4NoDcba:
167 .long LForwardVecUnal32NoDcba // 000: forward, unaligned
168 .long LForwardVecUnal32NoDcba // 001: forward, 4-byte aligned
169 .long LForwardVecUnal32NoDcba // 010: forward, 8-byte aligned
170 .long LForwardVecAlig32NoDcba // 011: forward, 16-byte aligned
171 .long LReverseVectorUnal32 // 100: reverse, unaligned
172 .long LReverseVectorUnal32 // 101: reverse, 4-byte aligned
173 .long LReverseVectorUnal32 // 110: reverse, 8-byte aligned
174 .long LReverseVectorAligned32 // 111: reverse, 16-byte aligned
175
176
177 // Pointer to the 8-element branch table for running CPU type:
178
179 LBranchTablePtr:
180 .long LG3 // default to G3 until "bcopy_initialize" called
181
182
183 // The CPU capability vector, initialized in pthread_init().
184 // "_bcopy_initialize" uses this to set up LBranchTablePtr:
185
186 .globl __cpu_capabilities
187 __cpu_capabilities:
188 .long 0
189
190 // Bit definitions for _cpu_capabilities:
191
192 #define kHasAltivec 0x01
193 #define k64Bit 0x02
194 #define kCache32 0x04
195 #define kCache64 0x08
196 #define kCache128 0x10
197 #define kUseDcba 0x20
198 #define kNoDcba 0x40
199
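// How __bcopy_initialize (below) consumes these bits, paraphrased in C. Illustrative
// only: the helper names are hypothetical and the constants are assumed to mirror the
// #defines above.
//
//	static int is_untyped_g4(unsigned caps) {      // G4 with no DCBA preference recorded
//	    return (caps & (kUseDcba | kNoDcba | kCache32 | k64Bit | kHasAltivec))
//	                == (kCache32 | kHasAltivec);
//	}
//	static int is_g4_with_dcba(unsigned caps) {
//	    return (caps & (kHasAltivec | k64Bit | kCache128 | kCache64 | kCache32 | kUseDcba | kNoDcba))
//	                == (kHasAltivec | kCache32 | kUseDcba);
//	}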
200
201 .text
202 .globl _bcopy
203 .globl _memcpy
204 .globl _memmove
205 .globl __bcopy_initialize
206
207
208 // Main entry points.
209
210 .align 5
211 _bcopy: // void bcopy(const void *src, void *dst, size_t len)
212 mr r10,r3 // reverse source and dest ptrs, to be like memcpy
213 mr r3,r4
214 mr r4,r10
215 _memcpy: // void* memcpy(void *dst, const void *src, size_t len)
216 _memmove: // void* memmove(void *dst, const void *src, size_t len)
217 cmplwi cr7,rc,32 // length <= 32 bytes?
218 sub. w1,r3,rs // must move in reverse if (rd-rs)<rc, set cr0 on src==dst
219 dcbt 0,rs // touch in the first line of source
220 cmplw cr6,w1,rc // set cr6 blt iff we must move reverse
221 cmplwi cr1,rc,kLong-1 // set cr1 bgt if long
222 mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
223 bgt- cr7,LMedium // longer than 32 bytes
224 dcbtst 0,rd // touch in destination
225 beq- cr7,LMove32 // special case moves of 32 bytes
226 blt- cr6,LShortReverse0
227
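// Illustrative only: the argument-order difference that the register swap in _bcopy
// handles. bcopy() takes (src, dst, len) while memcpy()/memmove() take (dst, src, len);
// all three funnel into the common code above.
//
//	#include <string.h>
//	#include <strings.h>
//
//	int main(void) {
//	    char a[8] = "example", b[8];
//	    bcopy(a, b, sizeof b);       // bcopy(src, dst, len)
//	    memcpy(b, a, sizeof b);      // memcpy(dst, src, len)
//	    memmove(b, a, sizeof b);     // memmove(dst, src, len)
//	    return 0;
//	}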
228 // Forward short operands. This is the most frequent case, so it is inline.
229 // We also end up here to xfer the last 0-31 bytes of longer operands.
230
231 LShort: // WARNING: can fall into this routine
232 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
233 mtcrf 0x01,rc // move rest of length to cr7
234 beq 1f // quadword to move?
235 lwz w1,0(rs)
236 lwz w2,4(rs)
237 lwz w3,8(rs)
238 lwz w4,12(rs)
239 addi rs,rs,16
240 stw w1,0(rd)
241 stw w2,4(rd)
242 stw w3,8(rd)
243 stw w4,12(rd)
244 addi rd,rd,16
245 1:
246 LShort16: // join here to xfer 0-15 bytes
247 bf 28,2f // doubleword?
248 lwz w1,0(rs)
249 lwz w2,4(rs)
250 addi rs,rs,8
251 stw w1,0(rd)
252 stw w2,4(rd)
253 addi rd,rd,8
254 2:
255 bf 29,3f // word?
256 lwz w1,0(rs)
257 addi rs,rs,4
258 stw w1,0(rd)
259 addi rd,rd,4
260 3:
261 bf 30,4f // halfword to move?
262 lhz w1,0(rs)
263 addi rs,rs,2
264 sth w1,0(rd)
265 addi rd,rd,2
266 4:
267 bflr 31 // skip if no odd byte
268 lbz w1,0(rs)
269 stb w1,0(rd)
270 blr
271
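// The short path above peels the remaining 0-31 bytes off by testing individual bits
// of the length: bit 27 selects a 16-byte piece, and cr7 bits 28-31 select 8/4/2/1-byte
// pieces. A rough C equivalent (the helper name is hypothetical):
//
//	#include <stddef.h>
//	#include <string.h>
//
//	static void copy_tail(unsigned char *dst, const unsigned char *src, size_t len) {
//	    // len < 32
//	    if (len & 16) { memcpy(dst, src, 16); src += 16; dst += 16; }
//	    if (len & 8)  { memcpy(dst, src, 8);  src += 8;  dst += 8;  }
//	    if (len & 4)  { memcpy(dst, src, 4);  src += 4;  dst += 4;  }
//	    if (len & 2)  { memcpy(dst, src, 2);  src += 2;  dst += 2;  }
//	    if (len & 1)  { *dst = *src; }
//	}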
272
273 // Handle short reverse operands, up to 31 bytes in length.
274 // This is also used to transfer the last 0-31 bytes of longer operands.
275
276 LShortReverse0:
277 add rs,rs,rc // adjust ptrs for reverse move
278 add rd,rd,rc
279 LShortReverse:
280 andi. r0,rc,0x10 // test bit 27 separately (sometimes faster than a mtcrf)
281 mtcrf 0x01,rc // move rest of length to cr7
282 beq 1f // quadword to move?
283 lwz w1,-4(rs)
284 lwz w2,-8(rs)
285 lwz w3,-12(rs)
286 lwzu w4,-16(rs)
287 stw w1,-4(rd)
288 stw w2,-8(rd)
289 stw w3,-12(rd)
290 stwu w4,-16(rd)
291 1:
292 LShortReverse16: // join here to xfer 0-15 bytes and return
293 bf 28,2f // doubleword?
294 lwz w1,-4(rs)
295 lwzu w2,-8(rs)
296 stw w1,-4(rd)
297 stwu w2,-8(rd)
298 2:
299 bf 29,3f // word?
300 lwzu w1,-4(rs)
301 stwu w1,-4(rd)
302 3:
303 bf 30,4f // halfword to move?
304 lhzu w1,-2(rs)
305 sthu w1,-2(rd)
306 4:
307 bflr 31 // done if no odd byte
308 lbz w1,-1(rs) // no update
309 stb w1,-1(rd)
310 blr
311
312
313 // Special case for 32-byte moves. Too long for LShort, too common for LMedium.
314
315 LMove32:
316 lwz w1,0(rs)
317 lwz w2,4(rs)
318 lwz w3,8(rs)
319 lwz w4,12(rs)
320 lwz w5,16(rs)
321 lwz w6,20(rs)
322 lwz w7,24(rs)
323 lwz w8,28(rs)
324 stw w1,0(rd)
325 stw w2,4(rd)
326 stw w3,8(rd)
327 stw w4,12(rd)
328 stw w5,16(rd)
329 stw w6,20(rd)
330 stw w7,24(rd)
331 stw w8,28(rd)
332 LExit:
333 blr
334
335
336 // Medium length operands (32 < rc < kLong.) These loops run on all CPUs, as the
337 // operands are not long enough to bother with the branch table, using cache ops, or
338 // Altivec. We word align the source rather than the dest (which is what we do for long
339 // operands): it is faster on G4+ and probably beyond, we never DCBA on medium-length
340 // operands anyway, and the chance to cancel reads of dest cache lines is limited.
341 // w1 = (rd-rs), used to check for alignment
342 // cr0 = set on (rd-rs)
343 // cr1 = bgt if long operand
344 // cr6 = blt if reverse move
345
346 LMedium:
347 dcbtst 0,rd // touch in 1st line of destination
348 rlwinm r0,w1,0,29,31 // r0 <- ((rd-rs) & 7), ie 0 if doubleword aligned
349 beq- LExit // early exit if (rs==rd), avoiding use of "beqlr"
350 neg w2,rs // we align source, not dest, and assume forward
351 cmpwi cr5,r0,0 // set cr5 beq if doubleword aligned
352 bgt- cr1,LLong // handle long operands
353 andi. w3,w2,3 // W3 <- #bytes to word-align source
354 blt- cr6,LMediumReverse // handle reverse move
355 lwz w1,0(rs) // pre-fetch first 4 bytes of source
356 beq- cr5,LMediumAligned // operands are doubleword aligned
357 sub rc,rc,w3 // adjust count for alignment
358 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShort16
359 srwi w4,rc,4 // w4 <- number of 16-byte chunks to xfer (>=1)
360 mtctr w4 // prepare loop count
361 beq+ 2f // source already aligned
362
363 lwzx w2,w3,rs // get 1st aligned word (which we might partially overwrite)
364 add rs,rs,w3 // word-align source ptr
365 stw w1,0(rd) // store all (w3) bytes at once to avoid a loop
366 add rd,rd,w3
367 mr w1,w2 // first aligned word to w1
368 b 2f
369
370 .align 4 // align inner loops
371 1: // loop over 16-byte chunks
372 lwz w1,0(rs)
373 2:
374 lwz w2,4(rs)
375 lwz w3,8(rs)
376 lwz w4,12(rs)
377 addi rs,rs,16
378 stw w1,0(rd)
379 stw w2,4(rd)
380 stw w3,8(rd)
381 stw w4,12(rd)
382 addi rd,rd,16
383 bdnz 1b
384
385 b LShort16
386
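// The source-alignment trick used above, sketched in C. Hypothetical helpers: load32 and
// store32 stand for plain 4-byte loads and stores. The first aligned source word is
// fetched *before* the 4-byte store because, for a forward overlapping move, that store
// can land on top of it.
//
//	#include <stdint.h>
//	#include <string.h>
//
//	static uint32_t load32(const void *p)         { uint32_t v; memcpy(&v, p, 4); return v; }
//	static void     store32(void *p, uint32_t v)  { memcpy(p, &v, 4); }
//
//	static void align_source_word(unsigned char **dstp, const unsigned char **srcp,
//	                              uint32_t *firstp) {    // *firstp = first 4 source bytes
//	    const unsigned char *src = *srcp;
//	    unsigned char       *dst = *dstp;
//	    size_t k = (0 - (uintptr_t)src) & 3;      // 0-3 bytes needed to word-align src
//	    uint32_t aligned = load32(src + k);       // read before the store below
//	    store32(dst, *firstp);                    // covers the k alignment bytes at once
//	    *srcp   = src + k;                        // both pointers advance by k
//	    *dstp   = dst + k;
//	    *firstp = aligned;                        // plays the role of w1 in the main loop
//	}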
387
388 // Medium, doubleword aligned. We use floating point. Note that G4+ has bigger latencies
389 // and reduced throughput for floating pt loads and stores; future processors will probably
390 // have even worse lfd/stfd performance. We use it here because it is so important for G3,
391 // and not slower for G4+. But we only do so for doubleword aligned operands, whereas the
392 // G3-only long operand loops use floating pt even for word-aligned operands.
393 // w2 = neg(rs)
394 // w1 = first 4 bytes of source
395
396 LMediumAligned:
397 andi. w3,w2,7 // already aligned?
398 sub rc,rc,w3 // adjust count by 0-7 bytes
399 lfdx f0,rs,w3 // pre-fetch first aligned source doubleword
400 srwi w4,rc,5 // get count of 32-byte chunks (might be 0 if unaligned)
401 mtctr w4
402 beq- LForwardFloatLoop1 // already aligned
403
404 cmpwi w4,0 // are there any 32-byte chunks to xfer?
405 lwz w2,4(rs) // get 2nd (unaligned) source word
406 add rs,rs,w3 // doubleword align source pointer
407 stw w1,0(rd) // store first 8 bytes of source to align...
408 stw w2,4(rd) // ...which could overwrite source
409 add rd,rd,w3 // doubleword align destination
410 bne+ LForwardFloatLoop1 // at least 1 chunk, so enter loop
411
412 subi rc,rc,8 // unfortunate degenerate case: no chunks to xfer
413 stfd f0,0(rd) // must store f0 since source might have been overwritten
414 addi rs,rs,8
415 addi rd,rd,8
416 b LShort
417
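// Why floating point: on 32-bit PowerPC the FPRs are the only 8-byte registers short of
// Altivec, so lfd/stfd move twice as much per instruction as lwz/stw. A hedged C analogue
// of the 32-bytes-per-iteration lfd/stfd loop this path feeds into (hypothetical helper;
// assumes both pointers are 8-byte aligned):
//
//	#include <stddef.h>
//	#include <stdint.h>
//
//	static void copy_32byte_chunks(uint64_t *dst, const uint64_t *src, size_t nchunks) {
//	    while (nchunks--) {
//	        uint64_t a = src[0], b = src[1], c = src[2], d = src[3];
//	        dst[0] = a; dst[1] = b; dst[2] = c; dst[3] = d;
//	        src += 4; dst += 4;
//	    }
//	}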
418
419 // Medium reverse moves. This loop runs on all processors.
420
421 LMediumReverse:
422 add rs,rs,rc // point to other end of operands when in reverse
423 add rd,rd,rc
424 andi. w3,rs,3 // w3 <- #bytes to word align source
425 lwz w1,-4(rs) // pre-fetch 1st 4 bytes of source
426 sub rc,rc,w3 // adjust count
427 srwi w4,rc,4 // get count of 16-byte chunks (>=1)
428 mtcrf 0x01,rc // remaining byte count (0-15) to cr7 for LShortReverse16
429 mtctr w4 // prepare loop count
430 beq+ 2f // source already aligned
431
432 sub rs,rs,w3 // word-align source ptr
433 lwz w2,-4(rs) // get 1st aligned word which we may overwrite
434 stw w1,-4(rd) // store all 4 bytes to align without a loop
435 sub rd,rd,w3
436 mr w1,w2 // shift 1st aligned source word to w1
437 b 2f
438
439 1:
440 lwz w1,-4(rs)
441 2:
442 lwz w2,-8(rs)
443 lwz w3,-12(rs)
444 lwzu w4,-16(rs)
445 stw w1,-4(rd)
446 stw w2,-8(rd)
447 stw w3,-12(rd)
448 stwu w4,-16(rd)
449 bdnz 1b
450
451 b LShortReverse16
452
453
454 // Long operands. Use branch table to decide which loop to use.
455 // w1 = (rd-rs), used to determine alignment
456
457 LLong:
458 xor w4,w1,rc // we must move reverse if (rd-rs)<rc
459 mflr ra // save return address
460 rlwinm w5,w1,1,27,30 // w5 <- ((w1 & 0xF) << 1)
461 bcl 20,31,1f // use reserved form to get our location
462 1:
463 mflr w3 // w3 == addr(1b)
464 lis w8,0x0408 // load a 16 element, 2-bit array into w8...
465 cntlzw w4,w4 // find first difference between (rd-rs) and rc
466 addis w2,w3,ha16(LBranchTablePtr-1b)
467 ori w8,w8,0x040C // ...used to map w5 to alignment encoding (ie, to 0-3)
468 lwz w2,lo16(LBranchTablePtr-1b)(w2) // w2 <- branch table address
469 slw w4,rc,w4 // bit 0 of w4 set iff (rd-rs)<rc
470 rlwnm w5,w8,w5,28,29 // put alignment encoding in bits 01100 of w5
471 rlwimi w2,w4,5,27,27 // put reverse bit in bit 10000 of branch table address
472 lwzx w3,w2,w5 // w3 <- load loop address from branch table
473 neg w1,rd // start to compute destination alignment
474 mtctr w3
475 andi. r0,w1,0x1F // r0 <- bytes req'd to 32-byte align dest (if forward move)
476 bctr // NB: r0/cr0 and w1 are passed as parameters
477
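// The reverse-move test above is branchless: cntlzw finds the most significant bit where
// (rd-rs) and rc differ, and shifting rc left by that count moves rc's bit at that
// position into the MSB, which is 1 exactly when (rd-rs) < rc as unsigned integers.
// A hedged C sketch (hypothetical name; __builtin_clz stands in for cntlzw):
//
//	#include <stdint.h>
//
//	static uint32_t reverse_needed(uint32_t delta /* rd-rs */, uint32_t len /* rc */) {
//	    uint32_t diff = delta ^ len;
//	    if (diff == 0)                    // equal: cntlzw returns 32 and slw yields 0
//	        return 0;
//	    unsigned n = (unsigned)__builtin_clz(diff);
//	    return (len << n) >> 31;          // MSB of (rc << n): 1 iff delta < len
//	}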
478
479 // G3, forward, long, unaligned.
480 // w1 = neg(rd)
481
482 LForwardWord:
483 andi. w3,w1,3 // W3 <- #bytes to word-align destination
484 mtlr ra // restore return address
485 sub rc,rc,w3 // adjust count for alignment
486 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
487 mtctr r0 // prepare loop count
488 beq+ 1f // dest already aligned
489
490 lwz w2,0(rs) // get first 4 bytes of source
491 lwzx w1,w3,rs // get source bytes we might overwrite
492 add rs,rs,w3 // adjust source ptr
493 stw w2,0(rd) // store all 4 bytes to avoid a loop
494 add rd,rd,w3 // word-align destination
495 b 2f
496 1:
497 lwz w1,0(rs)
498 2:
499 lwz w2,4(rs)
500 lwz w3,8(rs)
501 lwz w4,12(rs)
502 lwz w5,16(rs)
503 lwz w6,20(rs)
504 lwz w7,24(rs)
505 lwz w8,28(rs)
506 addi rs,rs,32
507 stw w1,0(rd)
508 stw w2,4(rd)
509 stw w3,8(rd)
510 stw w4,12(rd)
511 stw w5,16(rd)
512 stw w6,20(rd)
513 stw w7,24(rd)
514 stw w8,28(rd)
515 addi rd,rd,32
516 bdnz 1b
517
518 b LShort
519
520
521 // G3, forward, long, word aligned. We use floating pt even when only word aligned.
522 // w1 = neg(rd)
523
524 LForwardFloat:
525 andi. w3,w1,7 // W3 <- #bytes to doubleword-align destination
526 mtlr ra // restore return address
527 sub rc,rc,w3 // adjust count for alignment
528 srwi r0,rc,5 // number of 32-byte chunks to xfer (>=1)
529 mtctr r0 // prepare loop count
530 beq LForwardFloatLoop // dest already aligned
531
532 lwz w1,0(rs) // get first 8 bytes of source
533 lwz w2,4(rs)
534 lfdx f0,w3,rs // get source bytes we might overwrite
535 add rs,rs,w3 // word-align source ptr
536 stw w1,0(rd) // store all 8 bytes to avoid a loop
537 stw w2,4(rd)
538 add rd,rd,w3
539 b LForwardFloatLoop1
540
541 .align 4 // align since this loop is executed by G4s too
542 LForwardFloatLoop:
543 lfd f0,0(rs)
544 LForwardFloatLoop1: // enter here from LMediumAligned and above
545 lfd f1,8(rs)
546 lfd f2,16(rs)
547 lfd f3,24(rs)
548 addi rs,rs,32
549 stfd f0,0(rd)
550 stfd f1,8(rd)
551 stfd f2,16(rd)
552 stfd f3,24(rd)
553 addi rd,rd,32
554 bdnz LForwardFloatLoop
555
556 b LShort
557
558
559 // G4 Forward, long, 16-byte aligned, 32-byte cache ops, use DCBA and DCBT.
560 // r0/cr0 = #bytes to 32-byte align
561
562 LForwardVecAlig32Dcba:
563 bnel+ LAlign32 // align destination iff necessary
564 bl LPrepareForwardVectors
565 mtlr ra // restore return address before loading c128
566 li c128,128
567 b 1f // enter aligned loop
568
569 .align 5 // long loop heads should be at least 16-byte aligned
570 1: // loop over aligned 64-byte chunks
571 dcbt c96,rs // pre-fetch three cache lines ahead
572 dcbt c128,rs // and four
573 lvx v1,0,rs
574 lvx v2,c16,rs
575 lvx v3,c32,rs
576 lvx v4,c48,rs
577 addi rs,rs,64
578 dcba 0,rd // avoid read of destination cache lines
579 stvx v1,0,rd
580 stvx v2,c16,rd
581 dcba c32,rd
582 stvx v3,c32,rd
583 stvx v4,c48,rd
584 addi rd,rd,64
585 bdnz 1b
586
587 LForwardVectorAlignedEnd: // r0/cr0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
588 beq- 3f // no leftover quadwords
589 mtctr r0
590 2: // loop over remaining quadwords (1-3)
591 lvx v1,0,rs
592 addi rs,rs,16
593 stvx v1,0,rd
594 addi rd,rd,16
595 bdnz 2b
596 3:
597 mtspr VRSave,rv // restore bitmap of live vr's
598 bne cr6,LShort16 // handle last 0-15 bytes if any
599 blr
600
601
602 // G4 Forward, long, 16-byte aligned, 32-byte cache, use DCBT but not DCBA.
603 // r0/cr0 = #bytes to 32-byte align
604
605 LForwardVecAlig32NoDcba:
606 bnel+ LAlign32 // align destination iff necessary
607 bl LPrepareForwardVectors
608 mtlr ra // restore return address before loading c128
609 li c128,128
610 b 1f // enter aligned loop
611
612 .align 4 // balance 13-word loop between QWs...
613 nop // ...which improves performance 5% +/-
614 nop
615 1: // loop over aligned 64-byte chunks
616 dcbt c96,rs // pre-fetch three cache lines ahead
617 dcbt c128,rs // and four
618 lvx v1,0,rs
619 lvx v2,c16,rs
620 lvx v3,c32,rs
621 lvx v4,c48,rs
622 addi rs,rs,64
623 stvx v1,0,rd
624 stvx v2,c16,rd
625 stvx v3,c32,rd
626 stvx v4,c48,rd
627 addi rd,rd,64
628 bdnz 1b
629
630 b LForwardVectorAlignedEnd
631
632
633 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT and DCBA. At least on
634 // some CPUs, this routine is no slower than the simpler aligned version that does
635 // not use permutes. But it cannot be used with aligned operands, because of the
636 // way it prefetches source QWs.
637 // r0/cr0 = #bytes to 32-byte align
638
639 LForwardVecUnal32Dcba:
640 bnel+ LAlign32 // align destination iff necessary
641 bl LPrepareForwardVectors
642 lvx v1,0,rs // prime loop
643 mtlr ra // restore return address before loading c128
644 lvsl vp,0,rs // get permute vector to shift left
645 li c128,128
646 b 1f // enter aligned loop
647
648 .align 4 // long loop heads should be at least 16-byte aligned
649 1: // loop over aligned 64-byte destination chunks
650 lvx v2,c16,rs
651 dcbt c96,rs // touch 3rd cache line ahead
652 lvx v3,c32,rs
653 dcbt c128,rs // touch 4th cache line ahead
654 lvx v4,c48,rs
655 addi rs,rs,64
656 vperm vx,v1,v2,vp
657 lvx v1,0,rs
658 vperm vy,v2,v3,vp
659 dcba 0,rd // avoid read of destination lines
660 stvx vx,0,rd
661 vperm vz,v3,v4,vp
662 stvx vy,c16,rd
663 dcba c32,rd
664 vperm vx,v4,v1,vp
665 stvx vz,c32,rd
666 stvx vx,c48,rd
667 addi rd,rd,64
668 bdnz 1b
669
670 LForwardVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=next QW, cr7=(rc & F), cr6 set on cr7
671 beq- 3f // no leftover quadwords
672 mtctr r0
673 2: // loop over remaining quadwords
674 lvx v2,c16,rs
675 addi rs,rs,16
676 vperm vx,v1,v2,vp
677 vor v1,v2,v2 // v1 <- v2
678 stvx vx,0,rd
679 addi rd,rd,16
680 bdnz 2b
681 3:
682 mtspr VRSave,rv // restore bitmap of live vr's
683 bne cr6,LShort16 // handle last 0-15 bytes if any
684 blr
685
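// How the lvsl/vperm technique above handles a misaligned source, sketched in C with a
// hypothetical scalar stand-in for one vperm: each aligned 16-byte store is assembled
// from two adjacent aligned source quadwords, shifted by the source misalignment.
//
//	static void perm_quadword(unsigned char dst[16],
//	                          const unsigned char prev[16],   // earlier aligned source QW
//	                          const unsigned char next[16],   // following aligned source QW
//	                          unsigned shift) {               // src & 0xF
//	    for (int i = 0; i < 16; i++)
//	        dst[i] = (i + shift < 16) ? prev[i + shift] : next[i + shift - 16];
//	}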
686
687 // G4 Forward, long, unaligned, 32-byte cache ops, use DCBT but not DCBA.
688 // r0/cr0 = #bytes to 32-byte align
689
690 LForwardVecUnal32NoDcba:
691 bnel+ LAlign32 // align destination iff necessary
692 bl LPrepareForwardVectors
693 lvx v1,0,rs // prime loop
694 mtlr ra // restore return address before loading c128
695 lvsl vp,0,rs // get permute vector to shift left
696 li c128,128
697 b 1f // enter aligned loop
698
699 .align 4
700 nop // balance 17-word loop between QWs
701 nop
702 1: // loop over aligned 64-byte destination chunks
703 lvx v2,c16,rs
704 dcbt c96,rs // touch 3rd cache line ahead
705 lvx v3,c32,rs
706 dcbt c128,rs // touch 4th cache line ahead
707 lvx v4,c48,rs
708 addi rs,rs,64
709 vperm vx,v1,v2,vp
710 lvx v1,0,rs
711 vperm vy,v2,v3,vp
712 stvx vx,0,rd
713 vperm vz,v3,v4,vp
714 stvx vy,c16,rd
715 vperm vx,v4,v1,vp
716 stvx vz,c32,rd
717 stvx vx,c48,rd
718 addi rd,rd,64
719 bdnz 1b
720
721 b LForwardVectorUnalignedEnd
722
723
724 // G3 Reverse, long, unaligned.
725
726 LReverseWord:
727 bl LAlign8Reverse // 8-byte align destination
728 mtlr ra // restore return address
729 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
730 mtctr r0
731 1:
732 lwz w1,-4(rs)
733 lwz w2,-8(rs)
734 lwz w3,-12(rs)
735 lwz w4,-16(rs)
736 stw w1,-4(rd)
737 lwz w5,-20(rs)
738 stw w2,-8(rd)
739 lwz w6,-24(rs)
740 stw w3,-12(rd)
741 lwz w7,-28(rs)
742 stw w4,-16(rd)
743 lwzu w8,-32(rs)
744 stw w5,-20(rd)
745 stw w6,-24(rd)
746 stw w7,-28(rd)
747 stwu w8,-32(rd)
748 bdnz 1b
749
750 b LShortReverse
751
752
753 // G3 Reverse, long, word aligned.
754
755 LReverseFloat:
756 bl LAlign8Reverse // 8-byte align
757 mtlr ra // restore return address
758 srwi r0,rc,5 // get count of 32-byte chunks to xfer (> 1)
759 mtctr r0
760 1:
761 lfd f0,-8(rs)
762 lfd f1,-16(rs)
763 lfd f2,-24(rs)
764 lfdu f3,-32(rs)
765 stfd f0,-8(rd)
766 stfd f1,-16(rd)
767 stfd f2,-24(rd)
768 stfdu f3,-32(rd)
769 bdnz 1b
770
771 b LShortReverse
772
773
774 // G4 Reverse, long, 16-byte aligned, 32-byte DCBT but no DCBA.
775
776 LReverseVectorAligned32:
777 bl LAlign32Reverse // 32-byte align destination iff necessary
778 bl LPrepareReverseVectors
779 mtlr ra // restore return address before loading cm129
780 li cm129,-129
781 b 1f // enter aligned loop
782
783 .align 4
784 nop // must start in 3rd word of QW...
785 nop // ...to keep balanced
786 1: // loop over aligned 64-byte chunks
787 dcbt cm97,rs // pre-fetch three cache lines ahead
788 dcbt cm129,rs // and four
789 lvx v1,cm1,rs
790 lvx v2,cm17,rs
791 lvx v3,cm33,rs
792 lvx v4,cm49,rs
793 subi rs,rs,64
794 stvx v1,cm1,rd
795 stvx v2,cm17,rd
796 stvx v3,cm33,rd
797 stvx v4,cm49,rd
798 subi rd,rd,64
799 bdnz 1b
800
801 LReverseVectorAlignedEnd: // cr0/r0=#quadwords, rv=VRSave, cr7=low 4 bits of rc, cr6 set on cr7
802 beq 3f // no leftover quadwords
803 mtctr r0
804 2: // loop over 1-3 quadwords
805 lvx v1,cm1,rs
806 subi rs,rs,16
807 stvx v1,cm1,rd
808 subi rd,rd,16
809 bdnz 2b
810 3:
811 mtspr VRSave,rv // restore bitmap of live vr's
812 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
813 blr
814
815
816 // G4 Reverse, long, unaligned, 32-byte DCBT.
817
818 LReverseVectorUnal32:
819 bl LAlign32Reverse // align destination iff necessary
820 bl LPrepareReverseVectors
821 lvx v1,cm1,rs // prime loop
822 mtlr ra // restore return address before loading cm129
823 lvsl vp,0,rs // get permute vector to shift left
824 li cm129,-129
825 b 1f // enter aligned loop
826
827 .align 4
828 nop // start loop in 3rd word on QW to balance
829 nop
830 1: // loop over aligned 64-byte destination chunks
831 lvx v2,cm17,rs
832 dcbt cm97,rs // touch in 3rd source block
833 lvx v3,cm33,rs
834 dcbt cm129,rs // touch in 4th
835 lvx v4,cm49,rs
836 subi rs,rs,64
837 vperm vx,v2,v1,vp
838 lvx v1,cm1,rs
839 vperm vy,v3,v2,vp
840 stvx vx,cm1,rd
841 vperm vz,v4,v3,vp
842 stvx vy,cm17,rd
843 vperm vx,v1,v4,vp
844 stvx vz,cm33,rd
845 stvx vx,cm49,rd
846 subi rd,rd,64
847 bdnz 1b
848
849 LReverseVectorUnalignedEnd: // r0/cr0=#QWs, rv=VRSave, v1=source QW, cr7=low 4 bits of rc, cr6 set on cr7
850 beq 3f // no leftover quadwords
851 mtctr r0
852 2: // loop over 1-3 quadwords
853 lvx v2,cm17,rs
854 subi rs,rs,16
855 vperm vx,v2,v1,vp
856 vor v1,v2,v2 // v1 <- v2
857 stvx vx,cm1,rd
858 subi rd,rd,16
859 bdnz 2b
860 3:
861 mtspr VRSave,rv // restore bitmap of live vr's
862 bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
863 blr
864
865
866 // Subroutine to prepare for 64-byte forward vector loops.
867 // Returns many things:
868 // ctr = number of 64-byte chunks to move
869 // r0/cr0 = leftover QWs to move
870 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
871 // cr6 = beq if leftover byte count is 0
872 // c16..c96 loaded
873 // rv = original value of VRSave
874 // NB: c128 not set (if needed), since it is still "ra"
875
876 LPrepareForwardVectors:
877 mfspr rv,VRSave // get bitmap of live vector registers
878 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
879 oris w1,rv,0xFF00 // we use v0-v7
880 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShort16
881 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
882 mtspr VRSave,w1 // update mask
883 li c16,16 // get constants used in lvx/stvx
884 li c32,32
885 mtctr r0 // set up loop count
886 cmpwi cr6,w3,0 // set cr6 on leftover byte count
887 li c48,48
888 li c96,96
889 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
890 blr
891
892
893 // Subroutine to prepare for 64-byte reverse vector loops.
894 // Returns many things:
895 // ctr = number of 64-byte chunks to move
896 // r0/cr0 = leftover QWs to move
897 // cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
898 // cr6 = beq if leftover byte count is 0
899 // cm1..cm97 loaded
900 // rv = original value of VRSave
901 // NB: cm129 not set (if needed), since it is still "ra"
902
903 LPrepareReverseVectors:
904 mfspr rv,VRSave // get bitmap of live vector registers
905 srwi r0,rc,6 // get count of 64-byte chunks to move (>=1)
906 oris w1,rv,0xFF00 // we use v0-v7
907 mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
908 rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
909 mtspr VRSave,w1 // update mask
910 li cm1,-1 // get constants used in lvx/stvx
911 li cm17,-17
912 mtctr r0 // set up loop count
913 cmpwi cr6,w3,0 // set cr6 on leftover byte count
914 li cm33,-33
915 li cm49,-49
916 rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
917 li cm97,-97
918 blr
919
920
921 // Subroutine to align destination on a 32-byte boundary.
922 // r0 = number of bytes to xfer (0-31)
923
924 LAlign32:
925 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
926 mtcrf 0x02,r0
927 sub rc,rc,r0 // adjust length
928 bf 31,1f // skip if no odd bit
929 lbz w1,0(rs)
930 addi rs,rs,1
931 stb w1,0(rd)
932 addi rd,rd,1
933 1:
934 bf 30,2f // halfword to move?
935 lhz w1,0(rs)
936 addi rs,rs,2
937 sth w1,0(rd)
938 addi rd,rd,2
939 2:
940 bf 29,3f // word?
941 lwz w1,0(rs)
942 addi rs,rs,4
943 stw w1,0(rd)
944 addi rd,rd,4
945 3:
946 bf 28,4f // doubleword?
947 lwz w1,0(rs)
948 lwz w2,4(rs)
949 addi rs,rs,8
950 stw w1,0(rd)
951 stw w2,4(rd)
952 addi rd,rd,8
953 4:
954 bflr 27 // done if no quadword to move
955 lwz w1,0(rs)
956 lwz w2,4(rs)
957 lwz w3,8(rs)
958 lwz w4,12(rs)
959 addi rs,rs,16
960 stw w1,0(rd)
961 stw w2,4(rd)
962 stw w3,8(rd)
963 stw w4,12(rd)
964 addi rd,rd,16
965 blr
966
967 // Subroutine to align destination if necessary on a 32-byte boundary for reverse moves.
968 // rs and rd still point to low end of operands
969 // we adjust rs and rd to point to last byte moved
970
971 LAlign32Reverse:
972 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
973 add rs,rs,rc
974 andi. r0,rd,0x1F // r0 <- #bytes that must be moved to align destination
975 mtcrf 0x01,r0 // length to cr (faster to change 1 CR at a time)
976 mtcrf 0x02,r0
977 sub rc,rc,r0 // update length
978 beqlr- // destination already 32-byte aligned
979
980 bf 31,1f // odd byte?
981 lbzu w1,-1(rs)
982 stbu w1,-1(rd)
983 1:
984 bf 30,2f // halfword to move?
985 lhzu w1,-2(rs)
986 sthu w1,-2(rd)
987 2:
988 bf 29,3f // word?
989 lwzu w1,-4(rs)
990 stwu w1,-4(rd)
991 3:
992 bf 28,4f // doubleword?
993 lwz w1,-4(rs)
994 lwzu w2,-8(rs)
995 stw w1,-4(rd)
996 stwu w2,-8(rd)
997 4:
998 bflr 27 // done if no quadwords
999 lwz w1,-4(rs)
1000 lwz w2,-8(rs)
1001 lwz w3,-12(rs)
1002 lwzu w4,-16(rs)
1003 stw w1,-4(rd)
1004 stw w2,-8(rd)
1005 stw w3,-12(rd)
1006 stwu w4,-16(rd)
1007 blr
1008
1009
1010 // Subroutine to align destination on an 8-byte boundary for reverse moves.
1011 // rs and rd still point to low end of operands
1012 // we adjust rs and rd to point to last byte moved
1013
1014 LAlign8Reverse:
1015 add rd,rd,rc // point to last byte moved (ie, 1 past end of operands)
1016 add rs,rs,rc
1017 andi. r0,rd,0x7 // r0 <- #bytes that must be moved to align destination
1018 beqlr- // destination already 8-byte aligned
1019 mtctr r0 // set up for loop
1020 sub rc,rc,r0 // update length
1021 1:
1022 lbzu w1,-1(rs)
1023 stbu w1,-1(rd)
1024 bdnz 1b
1025
1026 blr
1027
1028
1029 // Called by pthread initialization to set up the branch table pointer based on
1030 // the CPU capability vector. This routine may be called more than once (for
1031 // example, during testing.)
1032
1033 // Size of the buffer we use to do DCBA timing on G4:
1034 #define kBufSiz 1024
1035
1036 // Stack frame size, which contains the 128-byte-aligned buffer:
1037 #define kSFSize (kBufSiz+128+16)
1038
1039 // Iterations of the timing loop:
1040 #define kLoopCnt 5
1041
1042 // Bit in cr5 used as a flag in timing loop:
1043 #define kDCBA 22
1044
1045 __bcopy_initialize: // int _bcopy_initialize(void)
1046 mflr ra // get return address
1047 stw ra,8(r1) // save
1048 stwu r1,-kSFSize(r1) // carve our temp buffer from the stack
1049 addi w6,r1,127+16 // get base address...
1050 rlwinm w6,w6,0,0,24 // ...of our buffer, 128-byte aligned
1051 bcl 20,31,1f // get our PIC base
1052 1:
1053 mflr w1
1054 addis w2,w1,ha16(__cpu_capabilities - 1b)
1055 lwz w3,lo16(__cpu_capabilities - 1b)(w2)
1056 andi. r0,w3,kUseDcba+kNoDcba+kCache32+k64Bit+kHasAltivec
1057 cmpwi r0,kCache32+kHasAltivec // untyped G4?
1058 li w8,0 // assume no need to test
1059 bne 2f // not an untyped G4, so do not test
1060
1061 // G4, but neither kUseDcba or kNoDcba are set. Time and select fastest.
1062
1063 crset kDCBA // first, use DCBA
1064 bl LTest32 // time it
1065 mr w8,w4 // w8 <- best time using DCBA
1066 srwi r0,w8,3 // bias 12 pct in favor of not using DCBA...
1067 add w8,w8,r0 // ...because DCBA is always slower with warm cache
1068 crclr kDCBA
1069 bl LTest32 // w4 <- best time without DCBA
1070 cmplw w8,w4 // which is better?
1071 li w8,kUseDcba // assume using DCBA is faster
1072 blt 2f
1073 li w8,kNoDcba // no DCBA is faster
1074
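// The dynamic selection above, sketched in C (hypothetical names; time_store_loop stands
// in for LTest32): time the store loop both ways, penalize the DCBA result by 1/8 because
// DCBA always loses with a warm cache, and keep whichever wins.
//
//	extern unsigned time_store_loop(int use_dcba);
//
//	static int pick_dcba(void) {
//	    unsigned with    = time_store_loop(1);
//	    unsigned without = time_store_loop(0);
//	    return (with + with / 8) < without;       // nonzero => use DCBA
//	}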
1075 // What branch table to use?
1076
1077 2: // here with w8 = 0, kUseDcba, or kNoDcba
1078 bcl 20,31,4f // get our PIC base again
1079 4:
1080 mflr w1
1081 addis w2,w1,ha16(__cpu_capabilities - 4b)
1082 lwz w3,lo16(__cpu_capabilities - 4b)(w2)
1083 or w3,w3,w8 // add in kUseDcba or kNoDcba if untyped G4
1084 mr r3,w8 // return dynamic selection, if any (used in testing)
1085
1086 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1087 cmpwi r0,kHasAltivec+kCache32+kUseDcba // G4 with DCBA?
1088 addis w4,w1,ha16(LG4UseDcba - 4b)
1089 addi w4,w4,lo16(LG4UseDcba - 4b)
1090 beq 5f
1091
1092 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32+kUseDcba+kNoDcba
1093 cmpwi r0,kHasAltivec+kCache32+kNoDcba // G4 without DCBA?
1094 addis w4,w1,ha16(LG4NoDcba - 4b)
1095 addi w4,w4,lo16(LG4NoDcba - 4b)
1096 beq 5f
1097
1098 andi. r0,w3,kHasAltivec+k64Bit+kCache128+kCache64+kCache32
1099 cmpwi r0,kCache32 // G3?
1100 addis w4,w1,ha16(LG3 - 4b)
1101 addi w4,w4,lo16(LG3 - 4b)
1102 beq 5f
1103
1104 // Map unrecognized CPU types to G3 (lowest common denominator)
1105
1106 5: // w4 <- branch table pointer
1107 addis w5,w1,ha16(LBranchTablePtr - 4b)
1108 stw w4,lo16(LBranchTablePtr - 4b)(w5)
1109 lwz ra,kSFSize+8(r1) // recover return address
1110 mtlr ra // restore it
1111 lwz r1,0(r1) // pop off our stack frame
1112 blr // return dynamic selection (or 0) in r3
1113
1114
1115 // Subroutine to time a store loop on a 32-byte-line cache.
1116 // kDCBA = set if we should use DCBA
1117 // w6 = base of buffer to use for test (kBufSiz bytes)
1118 // w4 = time of the fastest pass (the return value)
1119
1120 LTest32:
1121 li w1,kLoopCnt // number of times to loop
1122 li w4,-1 // initialize fastest time
1123 1:
1124 mr rd,w6 // initialize buffer ptr
1125 li r0,kBufSiz/32 // r0 <- cache blocks to test
1126 mtctr r0
1127 2:
1128 dcbf 0,rd // first, force the blocks out of the cache
1129 addi rd,rd,32
1130 bdnz 2b
1131 sync // make sure all the flushes take
1132 mr rd,w6 // re-initialize buffer ptr
1133 mtctr r0 // reset cache-block count
1134 mftbu w5 // remember upper half so we can check for carry
1135 mftb w2 // start the timer
1136 3: // loop over cache blocks
1137 bf kDCBA,4f // should we DCBA?
1138 dcba 0,rd
1139 4:
1140 stfd f1,0(rd) // store the entire cache block
1141 stfd f1,8(rd)
1142 stfd f1,16(rd)
1143 stfd f1,24(rd)
1144 addi rd,rd,32
1145 bdnz 3b
1146 mftb w3
1147 mftbu r0
1148 cmpw r0,w5 // did timebase carry?
1149 bne 1b // yes, retest rather than fuss
1150 sub w3,w3,w2 // w3 <- time for this loop
1151 cmplw w3,w4 // faster than current best?
1152 bge 5f // no
1153 mr w4,w3 // remember fastest time through loop
1154 5:
1155 subi w1,w1,1 // decrement outer loop count
1156 cmpwi w1,0 // more to go?
1157 bne 1b // loop if so
1158 blr
1159