/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
; Copy bytes of data around. Handles overlapped data.

#include <ppc/proc_reg.h>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)

; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32          // total size
#define BCOPY_SF_MSR    16          // we save caller's MSR here (possibly minus VEC and FP)

#define kShort          32          // short operands are special cased
; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - Interrupts must be disabled when this routine is called.
;            - Translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink. If both are in the
; same block, only one DBAT is loaded. We do not touch the IBATs, so there is no issue if
; either physical page address is the same as the virtual address of the instructions
; we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
; there anyway.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
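;
; In rough C terms, the flow below is as follows (an illustrative sketch only, not part of
; the build; crosses_page(), load_dbat(), same_block() and invalidate_dbats() are invented
; names for the inline DBAT manipulation in the code):
;
;   void bcopy_physvir_32(addr64_t from, addr64_t to, unsigned int nbytes) {
;       if (crosses_page(from, nbytes) || crosses_page(to, nbytes))
;           return bcopy_phys(from, to, nbytes);            // fall back to a real-mode copy
;       load_dbat(0, to);                                   // temporary DBAT mapping the sink
;       if (!same_block(from, to))
;           load_dbat(1, from);                             // and a second one for the source
;       bcopy((void *)(uintptr_t)from, (void *)(uintptr_t)to, nbytes);  // translation stays on
;       invalidate_dbats();                                 // clear the DBAT upper halves
;   }
;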
        .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mfsprg  r8,2                        ; get processor feature flags
        stw     r0,8(r1)                    ; save return address
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        subi    r0,r7,1                     ; get length - 1
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        add     r11,r3,r0                   ; Point to last byte of sink
        mr      r5,r7                       ; Get the length into the right register
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits

; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
; to bcopy_physvir() on 32-bit machines, and will result in a panic.

        add     r12,r4,r0                   ; Point to last byte of source
        xor     r7,r11,r3                   ; See if we went to next page
        xor     r8,r12,r4                   ; See if we went to next page
        or      r0,r7,r8                    ; Combine wrap

//      li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
        li      r9,((2<<3)|2)               ; Set default attributes
        rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
        li      r7,2                        ; Set validity flags
        li      r8,2                        ; Set validity flags
        bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...

        rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
        rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
        rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
        rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
        cmplw   cr1,r11,r12                 ; See if sink and source are same block

        mtdbatl 0,r11                       ; Set sink lower DBAT
        mtdbatu 0,r7                        ; Set sink upper DBAT

        beq-    cr1,bcpvsame                ; Source and sink are in same block

        mtdbatl 1,r12                       ; Set source lower DBAT
        mtdbatu 1,r8                        ; Set source upper DBAT
bcpvsame:
        sync                                ; wait for the BATs to stabilize

        bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

        li      r0,0                        ; Get set to invalidate upper half of BATs
        sync                                ; Make sure all is well
        mtdbatu 0,r0                        ; Clear sink upper DBAT
        mtdbatu 1,r0                        ; Clear source upper DBAT

        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
; there anyway.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
; code will not check, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.
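;
; In rough C terms (an illustrative sketch only, not part of the build; get_msr() and
; set_msr() are invented names for the mfmsr/mtmsr sequences that follow):
;
;   void bcopy_phys(addr64_t from, addr64_t to, unsigned int nbytes) {
;       unsigned long msr = get_msr();
;       set_msr(msr & ~(MSR_DR | MSR_VEC | MSR_FP));        // translation, vector, FP off
;       // (64-bit processors also set MSR_SF here, and divert to the cache-inhibited
;       //  path below if either operand is in I/O space)
;       bcopy((void *)(uintptr_t)from, (void *)(uintptr_t)to, nbytes);
;       set_msr((msr & ~(MSR_VEC | MSR_FP)) | MSR_DR);      // translation back on; VEC/FP stay off
;   }
;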
        .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mfsprg  r8,2                        ; get processor feature flags
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
        mr      r5,r7                       ; Get the length into the right register
bcopy_phys1:                                ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
        mfmsr   r9                          ; Get the MSR
        lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
        ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
        andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
        bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

        mtmsr   r9                          ; turn DR, FP, and VEC off

        bl      EXT(bcopy)                  ; do the copy with translation off and caching on

        mfmsr   r9                          ; Get the MSR
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
        mtmsr   r9                          ; restore msr
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
; 64-bit: turn DR off and SF on.

bcopy_phys64:                               ; r9 = MSR with DR, VEC, and FP off
        ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
        srdi    r2,r3,31                    ; Get a 1 if source is in I/O memory
        li      r0,1                        ; Note - we use this in a couple places below
        srdi    r10,r4,31                   ; Get a 1 if sink is in I/O memory
        std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
        rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on in MSR we will copy with
        cmpldi  cr0,r2,1                    ; Is source in I/O memory?
        cmpldi  cr7,r10,1                   ; Is sink in I/O memory?
        mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
        isync                               ; wait for it to happen
        cror    cr7_eq,cr0_eq,cr7_eq        ; See if either source or sink is in I/O area
        beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space

        bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

        mfmsr   r9                          ; Get the MSR we used to copy
        rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
        mtmsrd  r9                          ; turn 64-bit mode off, translation back on
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
        mtmsrd  r8,1                        ; turn EE back on if necessary
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.
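;
; The sequence below amounts to roughly the following (an illustrative sketch only; the
; helper names are invented stand-ins for the inline MSR/HID4/slbie manipulation):
;
;   disable_interrupts();           // clear MSR[EE] so we cannot lose control in this state
;   hid4_set_cache_inhibit();       // set the HID4 bit that makes real-mode accesses CI
;   flush_erat();                   // slbie an unused ESID so stale translations are gone
;   bcopy_nc(from, to, nbytes);     // copy without using any cache instructions
;   hid4_clear_cache_inhibit();     // put HID4 back the way it was
;   flush_erat();
;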
io_space_real_mode_copy:                    ; r0=1, r9=MSR we want to copy with
        sldi    r11,r0,31-MSR_EE_BIT        ; Get a mask for the EE bit
        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        andc    r9,r9,r11                   ; Turn off EE bit
        mfspr   r2,hid4                     ; Get HID4
        mtmsrd  r9,1                        ; Force off EE
        or      r2,r2,r0                    ; Set bit to make real accesses cache-inhibited

        mtspr   hid4,r2                     ; Make real accesses cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared

        bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        mfspr   r2,hid4                     ; Get HID4
        andc    r2,r2,r0                    ; Clear the bit so real accesses are no longer cache-inhibited

        mtspr   hid4,r2                     ; Make real accesses not cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses, this code cannot be called from bcopy_nc().
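;
; For the direction test, note that a reverse (descending) copy is needed only when the
; destination starts inside the source, i.e. when 0 <= (dest - source) < length. Computed
; as an unsigned number, (dest - source) is huge when dest < source, so a single unsigned
; compare against the length covers both conditions at once. For example (hypothetical
; addresses): source=0x1000, dest=0x1010, length=0x20 gives dest-source=0x10, which is
; less than 0x20, so we must move in reverse; with dest=0x0FF0 instead, dest-source wraps
; to 0xFFFFFFE0, which is not less than 0x20, so a forward move is safe.
;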
;       r12 = (dest - source)

shortcopy:
        cmplw   r12,r5                      ; must move reverse if (dest-source)<length
        mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
        mtcrf   1,r5                        ; ...which is faster on G4 and G5
        bge++   backend                     ; handle forward moves (most common case)
        add     r6,r6,r5                    ; point one past end of operands in reverse moves
        b       bbackend                    ; handle reverse moves
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)
        .globl  EXT(bcopy)
        .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(bcopy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; to==from so nothing to do
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
        .globl  EXT(bcopy_nc)
        .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
        cmpwi   cr1,r5,0                    ; Check if we have a 0 length
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        crset   noncache                    ; Set non-cached
        cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; either zero length or to==from
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.
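;
; In C terms the relationship is simply (illustrative only):
;
;   void *memcpy(void *to, const void *from, size_t n)  { bcopy(from, to, n); return to; }
;   void *memmove(void *to, const void *from, size_t n) { bcopy(from, to, n); return to; }
;
; i.e. the first two arguments are swapped relative to bcopy(), and the original "to"
; pointer must survive the copy so it can be returned in r3.
;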
        .globl  EXT(memcpy)
        .globl  EXT(memmove)
        .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r3,r4                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r4                       ; Set source
        mr      r4,r3                       ; Set the "to" (must preserve r3 for return value)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(memcpy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        beqlr-                              ; exit if to==from
; Here to copy on 32-bit processors.
;
; When we move the memory, overlapping operands must be handled correctly. We also
; cannot use the cache instructions if we were entered from bcopy_nc(). We need to
; preserve r3 because memcpy() must return it. We can be interrupted and lose
; control here.
;
;       r12 = (dest - source)
;       cr5 = noncache flag
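;
; Schematically, the forward path below does the following (an illustrative C sketch only,
; assuming the cached case; copy_bytes() and copy_line32() are invented names for the
; byte/half/word/double/quad front end and the unrolled line loop):
;
;   head = min(bytes_to_32byte_align(dest), len);   // limited by the r8 length mask
;   copy_bytes(dest, src, head);                    // 1/2/4/8/16-byte steps
;   dest += head; src += head; len -= head;
;   while (len >= 32) {                             // "nxtline" loop: dcbz the dest line,
;       copy_line32(dest, src);                     //  dcbt ahead, then 8 lwz/stw pairs
;       dest += 32; src += 32; len -= 32;
;   }
;   copy_bytes(dest, src, len);                     // "backend": 16/8/4/2/1-byte tail
;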
copyit32:                                   ; WARNING! can drop down to this label
        cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
        cntlzw  r11,r5                      ; get magnitude of length
        dcbt    0,r6                        ; start to touch in source
        lis     r10,hi16(0x80000000)        ; get 0x80000000
        neg     r9,r4                       ; start to get alignment for destination
        dcbtst  0,r4                        ; start to touch in destination
        sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
        blt-    cr1,reverse32bit            ; reverse move required
; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
;
;       r8 = inverse of largest mask smaller than operand length
;       r9 = neg(dest), used to compute alignment
;       cr5 = noncache flag
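;
; Why relative word alignment is enough: the front end below aligns the *destination*; if
; (dest - source) is a multiple of 4, the source reaches word alignment at the same moment,
; so every lwz/stw that follows is aligned. For example (hypothetical addresses),
; source=0x2002 and dest=0x3006 differ by 0x1004; after the 2-byte front-end move both
; pointers have their low two bits clear, and all remaining accesses are word aligned.
;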
forward32bit:                               ; enter from 64-bit CPUs with word aligned uncached operands
        rlwinm  r7,r9,0,0x1F                ; get bytes to 32-byte-align destination
        andc.   r0,r7,r8                    ; limit to the maximum front end move
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        beq     alline                      ; Already on a line...

        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        sub     r5,r5,r0                    ; Set the length left to move

        bf      31,alhalf                   ; No single byte to do...
        lbz     r7,0(r6)                    ; Get the byte
        addi    r6,r6,1                     ; Point to the next
        stb     r7,0(r4)                    ; Save the single
        addi    r4,r4,1                     ; Bump sink

; Sink is halfword aligned here

alhalf: bf      30,alword                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

alword: bf      29,aldouble                 ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

aldouble: bf    28,alquad                   ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

alquad: bf      27,alline                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        stw     r7,0(r4)                    ; Save the first word
        lwz     r11,12(r6)                  ; Get the fourth word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

alline: rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    backend                     ; No full lines to move
        mtctr   r0                          ; set up loop count
        li      r0,96                       ; Stride for touch ahead

nxtline:
        lwz     r2,0(r6)                    ; Get the first word
        lwz     r5,4(r6)                    ; Get the second word
        lwz     r7,8(r6)                    ; Get the third word
        lwz     r8,12(r6)                   ; Get the fourth word
        lwz     r9,16(r6)                   ; Get the fifth word
        lwz     r10,20(r6)                  ; Get the sixth word
        lwz     r11,24(r6)                  ; Get the seventh word
        lwz     r12,28(r6)                  ; Get the eighth word
        bt-     noncache,skipz              ; Skip if we are not cached...
        dcbz    0,r4                        ; Blow away the whole line because we are replacing it
        dcbt    r6,r0                       ; Touch ahead a bit

skipz:
        addi    r6,r6,32                    ; Point to the next
        stw     r2,0(r4)                    ; Save the first word
        stw     r5,4(r4)                    ; Save the second word
        stw     r7,8(r4)                    ; Save the third word
        stw     r8,12(r4)                   ; Save the fourth word
        stw     r9,16(r4)                   ; Save the fifth word
        stw     r10,20(r4)                  ; Save the sixth word
        stw     r11,24(r4)                  ; Save the seventh word
        stw     r12,28(r4)                  ; Save the eighth word
        addi    r4,r4,32                    ; Bump sink
        bdnz+   nxtline                     ; Do the next line, if any...
; Move backend quadword

backend:                                    ; Join here from "shortcopy" for forward moves <32 bytes
        bf      27,noquad                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        lwz     r11,12(r6)                  ; Get the fourth word
        stw     r7,0(r4)                    ; Save the first word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Move backend double

noquad: bf      28,nodouble                 ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

nodouble: bf    29,noword                   ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Move backend halfword

noword: bf      30,nohalf                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

nohalf: bflr    31                          ; Leave cuz we are all done...
        lbz     r7,0(r6)                    ; Get the byte
        stb     r7,0(r4)                    ; Save the single
        blr
; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
;
;       r8 = inverse of largest mask smaller than operand length
;       cr5 = noncache flag (but we don't dcbz anyway)
reverse32bit:                               ; here from 64-bit code with word aligned uncached operands
        add     r4,r5,r4                    ; Point past the last sink byte
        add     r6,r5,r6                    ; Point past the last source byte
        rlwinm  r7,r4,0,0x1F                ; Calculate the length to align dest on cache boundary
        li      r12,-1                      ; Make sure we touch in the actual line
        andc.   r0,r7,r8                    ; Apply movement limit
        dcbt    r12,r6                      ; Touch in the last line of source
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        dcbtst  r12,r4                      ; Touch in the last line of the sink
        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        beq-    balline                     ; Already on a cache line boundary (or too short to bother)

        sub     r5,r5,r0                    ; Precalculate the move length left after alignment

        bf      31,balhalf                  ; No single byte to do...
        lbz     r7,-1(r6)                   ; Get the byte
        subi    r6,r6,1                     ; Point to the next
        stb     r7,-1(r4)                   ; Save the single
        subi    r4,r4,1                     ; Bump sink

; Sink is halfword aligned here

balhalf: bf     30,balword                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword: bf     29,baldouble                ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble: bf   28,balquad                  ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad: bf     27,balline                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31              ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    bbackend                    ; No full lines to move
        mtctr   r0                          ; set up loop count
bnxtline:
        lwz     r7,-32(r6)                  ; Get the first word
        lwz     r5,-28(r6)                  ; Get the second word
        lwz     r2,-24(r6)                  ; Get the third word
        lwz     r12,-20(r6)                 ; Get the fourth word
        lwz     r11,-16(r6)                 ; Get the fifth word
        lwz     r10,-12(r6)                 ; Get the sixth word
        lwz     r9,-8(r6)                   ; Get the seventh word
        lwz     r8,-4(r6)                   ; Get the eighth word
        subi    r6,r6,32                    ; Point to the next

        stw     r7,-32(r4)                  ; Save the first word
        stw     r5,-28(r4)                  ; Save the second word
        stw     r2,-24(r4)                  ; Save the third word
        stw     r12,-20(r4)                 ; Save the fourth word
        stw     r11,-16(r4)                 ; Save the fifth word
        stw     r10,-12(r4)                 ; Save the sixth word
        stw     r9,-8(r4)                   ; Save the seventh word
        stw     r8,-4(r4)                   ; Save the eighth word
        subi    r4,r4,32                    ; Bump sink

        bdnz+   bnxtline                    ; Do the next line, if any...
; Note: We touched these lines in at the beginning

; Move backend quadword

bbackend:                                   ; Join here from "shortcopy" for reverse moves of <32 bytes
        bf      27,bnoquad                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad: bf     28,bnodouble                ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

bnodouble: bf   29,bnoword                  ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword: bf     30,bnohalf                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

bnohalf: bflr   31                          ; Leave cuz we are all done...
        lbz     r7,-1(r6)                   ; Get the byte
        stb     r7,-1(r4)                   ; Save the single
        blr
// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// cases.
//
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      r12 = (dest - source), reverse move required if (dest-source)<length
//      cr5 = noncache flag
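//
// A note on the mode-independent direction test below: subc computes (dest-source)-length
// and sets the Carry bit iff that subtraction does not borrow, ie iff (dest-source)>=length
// as an unsigned number; addze of a zeroed register then yields 1 for "forward is safe" and
// 0 (cr0 eq) for "reverse move required".  Worked example with hypothetical values:
// dest-source=0x40, length=0x100: the subtraction borrows, Carry=0, the addze result is 0,
// and the beq path selects the reverse-move code.
//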
copyit64:
        rlwinm  r7,r5,0,0,31                // truncate length to 32 bits, in case we're running in 64-bit mode
        cntlzw  r11,r5                      // get magnitude of length
        dcbt    0,r6                        // touch in 1st block of source
        dcbtst  0,r4                        // touch in 1st destination cache block
        subc    r7,r12,r7                   // set Carry if (dest-source)>=length, in mode-independent way

        lis     r10,hi16(0x80000000)        // get 0x80000000
        addze.  r0,r0                       // set cr0 on carry bit (beq if reverse move required)
        neg     r9,r4                       // start to get alignment for destination
        sraw    r8,r10,r11                  // get mask based on operand length, to limit alignment
        bt--    noncache,c64uncached        // skip if uncached
        beq--   c64rdouble                  // handle cached reverse moves
// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.
//
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      cr5 = noncache flag
c64double:
        rlwinm  r7,r9,0,0x7F                // get #bytes to 128-byte align destination
        andc    r7,r7,r8                    // limit by operand length
        andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
        srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
        sub     r5,r5,r7                    // adjust length remaining
        cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
        srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi   cr7,r10,0                   // set cr7 on chunk count
        beq     c64double2                  // dest already doubleword aligned

        .align  5                           // align inner loops
c64double1:                                 // copy bytes until dest is doubleword aligned

c64double2:                                 // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq     cr1,c64double4              // no doublewords to xfer in order to cache align

        .align  5                           // align inner loops
c64double3:                                 // copy doublewords until dest is 128-byte aligned

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                 // r10/cr7=128-byte chunks
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
        cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
        beq     cr7,c64double7              // no 128-byte chunks
; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
        sub     r8,r6,r4                    // r8 <- (source - dest)
        rldicr. r0,r8,0,63-7                // zero low 7 bits and check for 0, mode independent
        cror    noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128

        .align  5                           // align inner loop
c64InnerLoop:                               // loop copying 128-byte cache lines to 128-aligned destination
        ld      r0,0(r6)                    // start pipe: load 1st half-line

        bt      noncache,c64InnerLoop1      // skip if uncached or overlap
        dcbz128 0,r4                        // avoid prefetch of next cache line

        ld      r0,64(r6)                   // load 2nd half of chunk

        addi    r4,r4,128                   // advance to next dest chunk

        bdnz    c64InnerLoop                // loop if more chunks

c64double7:                                 // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
        beq     cr1,c64byte                 // no leftover doublewords

        .align  5                           // align inner loop
c64double8:                                 // loop copying leftover doublewords

// Forward byte loop.

c64byte:                                    // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes

        .align  5                           // align inner loop
// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      r12 = (dest-source), used to test relative alignment
//      cr0 = beq if reverse move required
//      cr5 = noncache flag
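//
// Dispatch summary for the uncached case (illustrative only):
//
//   if (reverse move needed)
//       relatively doubleword aligned ? c64rdouble : relatively word aligned ? reverse32bit : byte loop
//   else
//       relatively doubleword aligned ? c64double  : relatively word aligned ? forward32bit : byte loop
//
// "Relatively aligned" means (dest - source) is a multiple of 8 (or 4): the operands then
// reach mutual alignment together, so no individual access is ever unaligned.
//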
c64uncached:
        rlwinm  r10,r12,0,29,31             // relatively doubleword aligned?
        rlwinm  r11,r12,0,30,31             // relatively word aligned?
        cmpwi   cr7,r10,0                   // set cr7 beq if doubleword aligned
        cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
        beq--   c64reverseUncached

        beq     cr7,c64double               // doubleword aligned
        beq     cr1,forward32bit            // word aligned, use G3/G4 code
        cmpwi   r5,0                        // set cr0 on byte count
        b       c64byte                     // unaligned operands

c64reverseUncached:
        beq     cr7,c64rdouble              // doubleword aligned so can use LD/STD
        beq     cr1,reverse32bit            // word aligned, use G3/G4 code
        add     r6,r6,r5                    // point to (end+1) of source and dest
        cmpwi   r5,0                        // set cr0 on length
        b       c64rbyte                    // copy a byte at a time
// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
//
//      r8 = inverse of largest mask of low-order 1s smaller than operand length
//      cr5 = noncache flag
c64rdouble:
        add     r6,r6,r5                    // point to (end+1) of source and dest

        rlwinm  r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
        andc.   r7,r7,r8                    // limit by operand length
        sub     r5,r5,r7                    // adjust length
        srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
        cmpwi   cr1,r8,0                    // any chunks?
        beq     c64rd2                      // source already doubleword aligned

c64rd1:                                     // copy bytes until source doubleword aligned

c64rd2:                                     // r8/cr1 <- count of 64-byte chunks
        rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
        cmpwi   cr7,r0,0                    // leftover doublewords?
        beq     cr1,c64rd4                  // no chunks to xfer

        .align  5                           // align inner loop
c64rd3:                                     // loop copying 64-byte chunks

c64rd4:                                     // r0/cr7 = leftover doublewords  r5/cr0 = leftover bytes
        beq     cr7,c64rbyte                // no leftover doublewords

c64rd5:                                     // loop copying leftover doublewords

// Reverse byte loop.

c64rbyte:                                   // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes