 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
; Copy bytes of data around. Handles overlapped data.

#include <ppc/proc_reg.h>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)

; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32      // total size
#define BCOPY_SF_MSR    16      // we save caller's MSR here (possibly minus VEC and FP)

#define kShort          32      // short operands are special cased

; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
; Rules are: - neither source nor destination can cross a page.
;            - Interrupts must be disabled when this routine is called.
;            - Translation must be on when called.
; To do the copy, we build a 128KB DBAT for both the source and sink. If both are the same, only one
; is loaded. We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
; At the end, we invalidate the used DBATs.
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK since we cannot have addresses bigger than 32 bits there anyway.

; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
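;
; Illustrative note (not from the original source): the first instructions of the
; routine below merge each long-long physical address, passed as two 32-bit halves
; per the 32-bit ABI, into a single 64-bit register value. In C terms, roughly:
;
;     uint64_t paddr = ((uint64_t)hi_half << 32) | lo_half;
;
; The rlwinm replicates the high half into the upper 32 bits of the register,
; and the following rlwimi inserts the low half into the bottom 32 bits.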
        .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
        mflr r0    ; get return address
        rlwinm r3,r3,0,1,0    ; Duplicate high half of long long paddr into top of reg
        mfsprg r8,2    ; get processor feature flags
        stw r0,8(r1)    ; save return address
        rlwimi r3,r4,0,0,31    ; Combine bottom of long long to full 64-bits
        stwu r1,-BCOPY_SF_SIZE(r1)    ; push on a stack frame so we can call bcopy
        mtcrf 0x02,r8    ; move pf64Bit to cr6 so we can test
        subi r0,r7,1    ; get length - 1
        rlwinm r4,r5,0,1,0    ; Duplicate high half of long long paddr into top of reg
        add r11,r3,r0    ; Point to last byte of sink
        mr r5,r7    ; Get the length into the right register
        rlwimi r4,r6,0,0,31    ; Combine bottom of long long to full 64-bits
; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
; to bcopy_physvir() on 32-bit machines, and will result in a panic.
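;
; Illustrative sketch of the overflow test below (assuming the usual 4KB page
; size; variable names are descriptive only):
;
;     last_sink   = sink   + nbytes - 1;
;     last_source = source + nbytes - 1;
;     if ((((sink ^ last_sink) | (source ^ last_source)) & ~0xFFF) != 0)
;         goto bcopy_phys1;    // an operand crosses a page, fall back to real-mode copy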
        add r12,r4,r0    ; Point to last byte of source
        xor r7,r11,r3    ; See if we went to next page
        xor r8,r12,r4    ; See if we went to next page
        or r0,r7,r8    ; Combine wrap

//      li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
        li r9,((2<<3)|2)    ; Set default attributes
        rlwinm. r0,r0,0,0,19    ; Did we overflow a page?
        li r7,2    ; Set validity flags
        li r8,2    ; Set validity flags
        bne- bcopy_phys1    ; Overflowed page, do normal physical copy...

        rlwimi r11,r9,0,15,31    ; Set sink lower DBAT value
        rlwimi r12,r9,0,15,31    ; Set source lower DBAT value
        rlwimi r7,r11,0,0,14    ; Set sink upper DBAT value
        rlwimi r8,r12,0,0,14    ; Set source upper DBAT value
        cmplw cr1,r11,r12    ; See if sink and source are same block

        mtdbatl 0,r11    ; Set sink lower DBAT
        mtdbatu 0,r7    ; Set sink upper DBAT

        beq- cr1,bcpvsame    ; Source and sink are in same block

        mtdbatl 1,r12    ; Set source lower DBAT
        mtdbatu 1,r8    ; Set source upper DBAT

        sync    ; wait for the BATs to stabilize

        bl EXT(bcopy)    ; BATs set up, args in r3-r5, so do the copy with DR on

        li r0,0    ; Get set to invalidate upper half of BATs
        sync    ; Make sure all is well
        mtdbatu 0,r0    ; Clear sink upper DBAT
        mtdbatu 1,r0    ; Clear source upper DBAT

        lwz r0,BCOPY_SF_SIZE+8(r1)    ; get return address
        addi r1,r1,BCOPY_SF_SIZE    ; pop off stack frame
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on 32 and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be OK since we cannot have addresses bigger than 32 bits there anyway.
; Also note that you probably will not be happy if either the sink or source spans the
; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
; code does not check, so be careful.
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
; Interrupts may or may not be disabled.
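;
; Roughly, in C-like pseudocode (an illustrative sketch only; get_msr()/set_msr()
; are placeholders, not real kernel routines):
;
;     void bcopy_phys(addr64_t from, addr64_t to, unsigned int nbytes) {
;         unsigned long msr = get_msr();
;         set_msr(msr & ~(MSR_DR | MSR_VEC | MSR_FP));    // data translation off
;         bcopy(from, to, nbytes);                        // copy in real mode
;         set_msr(msr & ~(MSR_VEC | MSR_FP));             // translation back on, VEC/FP left off
;     }
;
; The 64-bit path additionally sets MSR[SF] for the copy and detours to
; io_space_real_mode_copy if either operand is in I/O space.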
        .globl  EXT(bcopy_phys)

        mflr r0    ; get return address
        rlwinm r3,r3,0,1,0    ; Duplicate high half of long long paddr into top of reg
        mfsprg r8,2    ; get processor feature flags
        stwu r1,-BCOPY_SF_SIZE(r1)    ; push on a stack frame so we can call bcopy
        rlwimi r3,r4,0,0,31    ; Combine bottom of long long to full 64-bits
        rlwinm r4,r5,0,1,0    ; Duplicate high half of long long paddr into top of reg
        mtcrf 0x02,r8    ; move pf64Bit to cr6 so we can test
        rlwimi r4,r6,0,0,31    ; Combine bottom of long long to full 64-bits
        mr r5,r7    ; Get the length into the right register

bcopy_phys1:    ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
        mfmsr r9    ; Get the MSR
        lis r6,hi16(MASK(MSR_VEC))    ; Get vector enable
        ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))    ; Add in FP and DR
        andc r9,r9,r6    ; unconditionally turn DR, VEC, and FP off
        bt++ pf64Bitb,bcopy_phys64    ; skip if 64-bit (only they take hint)

        mtmsr r9    ; turn DR, FP, and VEC off

        bl EXT(bcopy)    ; do the copy with translation off and caching on

        mfmsr r9    ; Get the MSR
        ori r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
        mtmsr r9    ; restore msr
        isync    ; wait for it to happen
        lwz r0,BCOPY_SF_SIZE+8(r1)    ; get return address once translation is back on

        addi r1,r1,BCOPY_SF_SIZE    ; pop off stack frame

; 64-bit: turn DR off and SF on.
bcopy_phys64:    ; r9 = MSR with DR, VEC, and FP off
        ori r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
        srdi r2,r3,31    ; Get a 1 if source is in I/O memory
        li r0,1    ; Note - we use this in a couple places below
        srdi r10,r4,31    ; Get a 1 if sink is in I/O memory
        std r8,BCOPY_SF_MSR(r1)    ; save caller's MSR so we remember whether EE was on
        rldimi r9,r0,63,MSR_SF_BIT    ; set SF on in MSR we will copy with
        cmpldi cr0,r2,1    ; Is source in I/O memory?
        cmpldi cr7,r10,1    ; Is sink in I/O memory?
        mtmsrd r9    ; turn 64-bit addressing on, data translation off
        isync    ; wait for it to happen
        cror cr7_eq,cr0_eq,cr7_eq    ; See if either source or sink is in I/O area
        beq-- cr7,io_space_real_mode_copy    ; an operand is in I/O space

        bl EXT(bcopy)    ; do copy with DR off and SF on, cache enabled

        mfmsr r9    ; Get the MSR we used to copy
        rldicl r9,r9,0,MSR_SF_BIT+1    ; clear SF
        ori r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
        mtmsrd r9    ; turn 64-bit mode off, translation back on
        isync    ; wait for it to happen
        lwz r0,BCOPY_SF_SIZE+8(r1)    ; get return address once translation is back on
        ld r8,BCOPY_SF_MSR(r1)    ; get caller's MSR once translation is back on

        mtmsrd r8,1    ; turn EE back on if necessary
        addi r1,r1,BCOPY_SF_SIZE    ; pop off stack frame

; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.
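;
; In outline, the I/O-space path below does the following (illustrative
; pseudocode; the bit position comes from the sldi of 32+8 below):
;
;     disable_interrupts();                  // clear MSR[EE]
;     hid4 |= (1ULL << 40);                  // cache-inhibit real-mode accesses
;     slbie(0x7FFFFFFFF0000000ULL);          // flush the ERAT
;     bcopy_nc(from, to, nbytes);            // copy without cache instructions
;     hid4 &= ~(1ULL << 40);                 // re-enable caching of real accesses
;     slbie(0x7FFFFFFFF0000000ULL);          // flush the ERAT again
;     // the caller's MSR (saved above) is then restored on the common exit path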
io_space_real_mode_copy:    ; r0=1, r9=MSR we want to copy with
        sldi r11,r0,31-MSR_EE_BIT    ; Get a mask for the EE bit
        sldi r0,r0,32+8    ; Get the right bit to turn off caching
        andc r9,r9,r11    ; Turn off EE bit
        mfspr r2,hid4    ; Get HID4
        mtmsrd r9,1    ; Force off EE
        or r2,r2,r0    ; Set bit to make real accesses cache-inhibited

        mtspr hid4,r2    ; Make real accesses cache-inhibited
        isync    ; Toss prefetches

        lis r12,0xE000    ; Get the unlikeliest ESID possible
        srdi r12,r12,1    ; Make 0x7FFFFFFFF0000000
        slbie r12    ; Make sure the ERAT is cleared

        bl EXT(bcopy_nc)    ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

        sldi r0,r0,32+8    ; Get the right bit to turn off caching
        mfspr r2,hid4    ; Get HID4
        andc r2,r2,r0    ; Clear the bit that makes real accesses cache-inhibited
        mtspr hid4,r2    ; Make real accesses not cache-inhibited
        isync    ; Toss prefetches

        lis r12,0xE000    ; Get the unlikeliest ESID possible
        srdi r12,r12,1    ; Make 0x7FFFFFFFF0000000
        slbie r12    ; Make sure the ERAT is cleared
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (ie, when the operands overlap.)
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
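;
; The direction decision is, in effect (illustrative sketch):
;
;     if ((unsigned)(dest - source) < length)
;         copy backwards from the high ends;    // a forward copy would clobber the source
;     else
;         copy forwards;
;
; When the operands do not overlap, the comparison may go either way in 64-bit
; mode, which is the harmless spurious-reverse case mentioned above.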
; r12 = (dest - source)

        cmplw r12,r5    ; must move reverse if (dest-source)<length
        mtcrf 2,r5    ; move length to cr6 and cr7 one at a time...
        mtcrf 1,r5    ; ...which is faster on G4 and G5
        bge++ backend    ; handle forward moves (most common case)
        add r6,r6,r5    ; point one past end of operands in reverse moves

        b bbackend    ; handle reverse moves
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;   - on 32-bit processors (32-byte cache line)
;   - on 64-bit processors running in 32-bit mode (128-byte cache line)
;   - on 64-bit processors running in 64-bit mode (128-byte cache line)

        .globl  EXT(bcopy_nop_if_32bit)

        cmplwi cr1,r5,kShort    ; less than 32 bytes?
        sub. r12,r4,r3    ; test for to==from in mode-independent way, start fwd/rev check
        mr r6,r3    ; Set source (must preserve r3 for memcpy return)
        blt cr1,shortcopy    ; special case short operands
        crclr noncache    ; Set cached
LEXT(bcopy_nop_if_32bit)
        bne++ copyit64    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+ copyit32    ; handle 32-bit processor
        blr    ; to==from so nothing to do

; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

        .globl  EXT(bcopy_nc_nop_if_32bit)

        cmpwi cr1,r5,0    ; Check if we have a 0 length
        sub. r12,r4,r3    ; test for to==from in mode-independent way, start fwd/rev check
        mr r6,r3    ; Set source (must preserve r3 for memcpy return)
        crset noncache    ; Set non-cached
        cror cr0_eq,cr1_eq,cr0_eq    ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
        bne++ copyit64    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+ copyit32    ; handle 32-bit processor
        blr    ; either zero length or to==from

; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.
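;
; In C terms (illustrative only):
;
;     void *memcpy(void *to, const void *from, size_t n);     // returns "to"
;     void *memmove(void *to, const void *from, size_t n);    // returns "to"
;
; Because both return the original "to" pointer, r3 must survive the copy;
; that is why the code below moves the source into r6 and the sink into r4
; and never touches r3 again.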
        .globl  EXT(memcpy_nop_if_32bit)

        cmplwi cr1,r5,kShort    ; less than 32 bytes?
        sub. r12,r3,r4    ; test for to==from in mode-independent way, start fwd/rev check
        mr r6,r4    ; Set source
        mr r4,r3    ; Set the "to" (must preserve r3 for return value)
        blt cr1,shortcopy    ; special case short operands
        crclr noncache    ; Set cached
LEXT(memcpy_nop_if_32bit)
        bne++ copyit64    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        beqlr-    ; exit if to==from

; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlays must be handled. We
; also can not use the cache instructions if we are from bcopy_nc.
; We need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.

; r12 = (dest - source)
; cr5 = noncache flag

copyit32:    ; WARNING! can drop down to this label
        cmplw cr1,r12,r5    ; must move reverse if (dest-source)<length
        cntlzw r11,r5    ; get magnitude of length
        dcbt 0,r6    ; start to touch in source
        lis r10,hi16(0x80000000)    ; get 0x80000000
        neg r9,r4    ; start to get alignment for destination
        dcbtst 0,r4    ; start to touch in destination
        sraw r8,r10,r11    ; get mask based on operand length, to limit alignment
        blt- cr1,reverse32bit    ; reverse move required

; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.

; r8 = inverse of largest mask smaller than operand length
; r9 = neg(dest), used to compute alignment
; cr5 = noncache flag
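;
; The front-end alignment below is, in effect (illustrative sketch; r8 is the
; length-limiting mask computed above):
;
;     k  = (-dest) & 31;    // bytes needed to 32-byte align the sink
;     k &= ~r8;             // never move more than the operand holds
;     length -= k;
;     // then move 1-, 2-, 4-, 8-, and 16-byte pieces as the low bits of k dictate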
forward32bit:    ; enter from 64-bit CPUs with word aligned uncached operands
        rlwinm r7,r9,0,0x1F    ; get bytes to 32-byte-align destination
        andc. r0,r7,r8    ; limit to the maximum front end move
        mtcrf 0x01,r0    ; move length to cr6 and cr7 one cr at a time...
        beq alline    ; Already on a line...

        mtcrf 0x02,r0    ; ...since moving more than one is slower on G4 and G5
        sub r5,r5,r0    ; Set the length left to move

        bf 31,alhalf    ; No single byte to do...
        lbz r7,0(r6)    ; Get the byte
        addi r6,r6,1    ; Point to the next
        stb r7,0(r4)    ; Save the single
        addi r4,r4,1    ; Bump sink

; Sink is halfword aligned here

alhalf: bf 30,alword    ; No halfword to do...
        lhz r7,0(r6)    ; Get the halfword
        addi r6,r6,2    ; Point to the next
        sth r7,0(r4)    ; Save the halfword
        addi r4,r4,2    ; Bump sink

; Sink is word aligned here

alword: bf 29,aldouble    ; No word to do...
        lwz r7,0(r6)    ; Get the word
        addi r6,r6,4    ; Point to the next
        stw r7,0(r4)    ; Save the word
        addi r4,r4,4    ; Bump sink

; Sink is double aligned here

aldouble: bf 28,alquad    ; No double to do...
        lwz r7,0(r6)    ; Get the first word
        lwz r8,4(r6)    ; Get the second word
        addi r6,r6,8    ; Point to the next
        stw r7,0(r4)    ; Save the first word
        stw r8,4(r4)    ; Save the second word
        addi r4,r4,8    ; Bump sink

; Sink is quadword aligned here

alquad: bf 27,alline    ; No quad to do...
        lwz r7,0(r6)    ; Get the first word
        lwz r8,4(r6)    ; Get the second word
        lwz r9,8(r6)    ; Get the third word
        stw r7,0(r4)    ; Save the first word
        lwz r11,12(r6)    ; Get the fourth word
        addi r6,r6,16    ; Point to the next
        stw r8,4(r4)    ; Save the second word
        stw r9,8(r4)    ; Save the third word
        stw r11,12(r4)    ; Save the fourth word
        addi r4,r4,16    ; Bump sink

; Sink is line aligned here
alline: rlwinm. r0,r5,27,5,31    ; Get the number of full lines to move
        mtcrf 0x02,r5    ; move length to cr6 and cr7 one cr at a time...
        mtcrf 0x01,r5    ; ...since moving more than one is slower on G4 and G5
        beq- backend    ; No full lines to move

        mtctr r0    ; set up loop count
        li r0,96    ; Stride for touch ahead

        lwz r2,0(r6)    ; Get the first word
        lwz r5,4(r6)    ; Get the second word
        lwz r7,8(r6)    ; Get the third word
        lwz r8,12(r6)    ; Get the fourth word
        lwz r9,16(r6)    ; Get the fifth word
        lwz r10,20(r6)    ; Get the sixth word
        lwz r11,24(r6)    ; Get the seventh word
        lwz r12,28(r6)    ; Get the eighth word
        bt- noncache,skipz    ; Skip if we are not cached...
        dcbz 0,r4    ; Blow away the whole line because we are replacing it
        dcbt r6,r0    ; Touch ahead a bit

        addi r6,r6,32    ; Point to the next
        stw r2,0(r4)    ; Save the first word
        stw r5,4(r4)    ; Save the second word
        stw r7,8(r4)    ; Save the third word
        stw r8,12(r4)    ; Save the fourth word
        stw r9,16(r4)    ; Save the fifth word
        stw r10,20(r4)    ; Save the sixth word
        stw r11,24(r4)    ; Save the seventh word
        stw r12,28(r4)    ; Save the eighth word
        addi r4,r4,32    ; Bump sink
        bdnz+ nxtline    ; Do the next line, if any...

; Move backend quadword
backend:    ; Join here from "shortcopy" for forward moves <32 bytes
        bf 27,noquad    ; No quad to do...
        lwz r7,0(r6)    ; Get the first word
        lwz r8,4(r6)    ; Get the second word
        lwz r9,8(r6)    ; Get the third word
        lwz r11,12(r6)    ; Get the fourth word
        stw r7,0(r4)    ; Save the first word
        addi r6,r6,16    ; Point to the next
        stw r8,4(r4)    ; Save the second word
        stw r9,8(r4)    ; Save the third word
        stw r11,12(r4)    ; Save the fourth word
        addi r4,r4,16    ; Bump sink

; Move backend double

noquad: bf 28,nodouble    ; No double to do...
        lwz r7,0(r6)    ; Get the first word
        lwz r8,4(r6)    ; Get the second word
        addi r6,r6,8    ; Point to the next
        stw r7,0(r4)    ; Save the first word
        stw r8,4(r4)    ; Save the second word
        addi r4,r4,8    ; Bump sink

nodouble: bf 29,noword    ; No word to do...
        lwz r7,0(r6)    ; Get the word
        addi r6,r6,4    ; Point to the next
        stw r7,0(r4)    ; Save the word
        addi r4,r4,4    ; Bump sink

; Move backend halfword

noword: bf 30,nohalf    ; No halfword to do...
        lhz r7,0(r6)    ; Get the halfword
        addi r6,r6,2    ; Point to the next
        sth r7,0(r4)    ; Save the halfword
        addi r4,r4,2    ; Bump sink

nohalf: bflr 31    ; Leave cuz we are all done...
        lbz r7,0(r6)    ; Get the byte
        stb r7,0(r4)    ; Save the single
; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.

; r8 = inverse of largest mask smaller than operand length
; cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:    ; here from 64-bit code with word aligned uncached operands
        add r4,r5,r4    ; Point past the last sink byte
        add r6,r5,r6    ; Point past the last source byte
        rlwinm r7,r4,0,0x1F    ; Calculate the length to align dest on cache boundary
        li r12,-1    ; Make sure we touch in the actual line
        andc. r0,r7,r8    ; Apply movement limit
        dcbt r12,r6    ; Touch in the last line of source
        mtcrf 0x01,r0    ; move length to cr6 and cr7 one cr at a time...
        dcbtst r12,r4    ; Touch in the last line of the sink
        mtcrf 0x02,r0    ; ...since moving more than one is slower on G4 and G5
        beq- balline    ; Already on cache line boundary (or too short to bother)

        sub r5,r5,r0    ; Precalculate move length left after alignment
        bf 31,balhalf    ; No single byte to do...
        lbz r7,-1(r6)    ; Get the byte
        subi r6,r6,1    ; Point to the next
        stb r7,-1(r4)    ; Save the single
        subi r4,r4,1    ; Bump sink

; Sink is halfword aligned here

balhalf: bf 30,balword    ; No halfword to do...
        lhz r7,-2(r6)    ; Get the halfword
        subi r6,r6,2    ; Point to the next
        sth r7,-2(r4)    ; Save the halfword
        subi r4,r4,2    ; Bump sink

; Sink is word aligned here

balword: bf 29,baldouble    ; No word to do...
        lwz r7,-4(r6)    ; Get the word
        subi r6,r6,4    ; Point to the next
        stw r7,-4(r4)    ; Save the word
        subi r4,r4,4    ; Bump sink

; Sink is double aligned here

baldouble: bf 28,balquad    ; No double to do...
        lwz r7,-8(r6)    ; Get the first word
        lwz r8,-4(r6)    ; Get the second word
        subi r6,r6,8    ; Point to the next
        stw r7,-8(r4)    ; Save the first word
        stw r8,-4(r4)    ; Save the second word
        subi r4,r4,8    ; Bump sink

; Sink is quadword aligned here

balquad: bf 27,balline    ; No quad to do...
        lwz r7,-16(r6)    ; Get the first word
        lwz r8,-12(r6)    ; Get the second word
        lwz r9,-8(r6)    ; Get the third word
        lwz r11,-4(r6)    ; Get the fourth word
        stw r7,-16(r4)    ; Save the first word
        subi r6,r6,16    ; Point to the next
        stw r8,-12(r4)    ; Save the second word
        stw r9,-8(r4)    ; Save the third word
        stw r11,-4(r4)    ; Save the fourth word
        subi r4,r4,16    ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31    ; Get the number of full lines to move
        mtcrf 0x02,r5    ; move length to cr6 and cr7 one cr at a time...
        mtcrf 0x01,r5    ; ...since moving more than one is slower on G4 and G5
        beq- bbackend    ; No full lines to move
        mtctr r0    ; set up loop count

        lwz r7,-32(r6)    ; Get the first word
        lwz r5,-28(r6)    ; Get the second word
        lwz r2,-24(r6)    ; Get the third word
        lwz r12,-20(r6)    ; Get the fourth word
        lwz r11,-16(r6)    ; Get the fifth word
        lwz r10,-12(r6)    ; Get the sixth word
        lwz r9,-8(r6)    ; Get the seventh word
        lwz r8,-4(r6)    ; Get the eighth word
        subi r6,r6,32    ; Point to the next
        stw r7,-32(r4)    ; Save the first word
        stw r5,-28(r4)    ; Save the second word
        stw r2,-24(r4)    ; Save the third word
        stw r12,-20(r4)    ; Save the fourth word
        stw r11,-16(r4)    ; Save the fifth word
        stw r10,-12(r4)    ; Save the sixth word
        stw r9,-8(r4)    ; Save the seventh word
        stw r8,-4(r4)    ; Save the eighth word
        subi r4,r4,32    ; Bump sink

        bdnz+ bnxtline    ; Do the next line, if any...

; Note: We touched these lines in at the beginning

; Move backend quadword

bbackend:    ; Join here from "shortcopy" for reverse moves of <32 bytes
        bf 27,bnoquad    ; No quad to do...
        lwz r7,-16(r6)    ; Get the first word
        lwz r8,-12(r6)    ; Get the second word
        lwz r9,-8(r6)    ; Get the third word
        lwz r11,-4(r6)    ; Get the fourth word
        stw r7,-16(r4)    ; Save the first word
        subi r6,r6,16    ; Point to the next
        stw r8,-12(r4)    ; Save the second word
        stw r9,-8(r4)    ; Save the third word
        stw r11,-4(r4)    ; Save the fourth word
        subi r4,r4,16    ; Bump sink

; Move backend double

bnoquad: bf 28,bnodouble    ; No double to do...
        lwz r7,-8(r6)    ; Get the first word
        lwz r8,-4(r6)    ; Get the second word
        subi r6,r6,8    ; Point to the next
        stw r7,-8(r4)    ; Save the first word
        stw r8,-4(r4)    ; Save the second word
        subi r4,r4,8    ; Bump sink

bnodouble: bf 29,bnoword    ; No word to do...
        lwz r7,-4(r6)    ; Get the word
        subi r6,r6,4    ; Point to the next
        stw r7,-4(r4)    ; Save the word
        subi r4,r4,4    ; Bump sink

; Move backend halfword

bnoword: bf 30,bnohalf    ; No halfword to do...
        lhz r7,-2(r6)    ; Get the halfword
        subi r6,r6,2    ; Point to the next
        sth r7,-2(r4)    ; Save the halfword
        subi r4,r4,2    ; Bump sink

bnohalf: bflr 31    ; Leave cuz we are all done...
        lbz r7,-1(r6)    ; Get the byte
        stb r7,-1(r4)    ; Save the single
// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0) cases.

// r4 = destination (32 or 64-bit ptr)
// r5 = length (always 32 bits)
// r6 = source (32 or 64-bit ptr)
// r12 = (dest - source), reverse move required if (dest-source)<length
// cr5 = noncache flag
        rlwinm r7,r5,0,0,31    // truncate length to 32 bits, in case we're running in 64-bit mode
        cntlzw r11,r5    // get magnitude of length
        dcbt 0,r6    // touch in 1st block of source
        dcbtst 0,r4    // touch in 1st destination cache block
        subc r7,r12,r7    // set Carry if (dest-source)>=length, in mode-independent way

        lis r10,hi16(0x80000000)    // get 0x80000000
        addze. r0,r0    // set cr0 on carry bit (beq if reverse move required)
        neg r9,r4    // start to get alignment for destination
        sraw r8,r10,r11    // get mask based on operand length, to limit alignment
        bt-- noncache,c64uncached    // skip if uncached
        beq-- c64rdouble    // handle cached reverse moves

// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.

// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// cr5 = noncache flag
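//
// The alignment arithmetic below is, in effect (illustrative sketch; r8 is the
// length-limiting mask computed above):
//
//     k          = (-dest) & 127;    // bytes to 128-byte align the destination
//     k         &= ~r8;              // clamped by the operand length
//     byte_part  = k & 7;            // bytes to doubleword align
//     dword_part = k >> 3;           // doublewords from there to 128-byte alignment
//     length    -= k;
//     chunks     = length >> 7;      // 128-byte chunks to move after aligning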
        rlwinm r7,r9,0,0x7F    // get #bytes to 128-byte align destination
        andc r7,r7,r8    // limit by operand length
        andi. r8,r7,7    // r8 <- #bytes to doubleword align
        srwi r9,r7,3    // r9 <- #doublewords to 128-byte align
        sub r5,r5,r7    // adjust length remaining
        cmpwi cr1,r9,0    // any doublewords to move to cache align?
        srwi r10,r5,7    // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi cr7,r10,0    // set cr7 on chunk count
        beq c64double2    // dest already doubleword aligned

        .align 5    // align inner loops
c64double1:    // copy bytes until dest is doubleword aligned

c64double2:    // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq cr1,c64double4    // no doublewords to xfer in order to cache align

        .align 5    // align inner loops
c64double3:    // copy doublewords until dest is 128-byte aligned

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:    // r10/cr7=128-byte chunks
        rlwinm r0,r5,29,28,31    // r0 <- count of leftover doublewords, after moving chunks
        cmpwi cr1,r0,0    // set cr1 on leftover doublewords
        beq cr7,c64double7    // no 128-byte chunks

; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
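;
; In effect (illustrative sketch):
;
;     if (((source - dest) & ~(uint64_t)127) == 0)    // source within 128 bytes above dest
;         noncache = 1;    // dcbz128 on the destination could destroy unread source bytes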
        sub r8,r6,r4    // r8 <- (source - dest)
        rldicr. r0,r8,0,63-7    // zero low 7 bits and check for 0, mode independent
        cror noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128

        .align 5    // align inner loop
c64InnerLoop:    // loop copying 128-byte cache lines to 128-aligned destination
        ld r0,0(r6)    // start pipe: load 1st half-line

        bt noncache,c64InnerLoop1    // skip if uncached or overlap
        dcbz128 0,r4    // avoid prefetch of next cache line

        ld r0,64(r6)    // load 2nd half of chunk

        addi r4,r4,128    // advance to next dest chunk

        bdnz c64InnerLoop    // loop if more chunks

c64double7:    // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm r0,r5,29,28,31    // r0 <- count of leftover doublewords (0-15)
        andi. r5,r5,7    // r5/cr0 <- count of leftover bytes (0-7)
        beq cr1,c64byte    // no leftover doublewords

        .align 5    // align inner loop
c64double8:    // loop copying leftover doublewords

// Forward byte loop.

c64byte:    // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr    // done if no leftover bytes

        .align 5    // align inner loop

// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.

// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// r12 = (dest-source), used to test relative alignment
// cr0 = beq if reverse move required
// cr5 = noncache flag
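//
// The dispatch below is, in effect (illustrative sketch; "rel" is the
// dest-source difference already computed in r12):
//
//     if ((rel & 7) == 0)         use the doubleword (ld/std) loops;
//     else if ((rel & 3) == 0)    use the 32-bit word loops (forward32bit/reverse32bit);
//     else                        copy one byte at a time;
//
// with forward or reverse variants selected by the cr0 test done earlier.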
        rlwinm r10,r12,0,29,31    // relatively doubleword aligned?
        rlwinm r11,r12,0,30,31    // relatively word aligned?
        cmpwi cr7,r10,0    // set cr7 beq if doubleword aligned
        cmpwi cr1,r11,0    // set cr1 beq if word aligned
        beq-- c64reverseUncached

        beq cr7,c64double    // doubleword aligned
        beq cr1,forward32bit    // word aligned, use G3/G4 code
        cmpwi r5,0    // set cr0 on byte count
        b c64byte    // unaligned operands

        beq cr7,c64rdouble    // doubleword aligned so can use LD/STD
        beq cr1,reverse32bit    // word aligned, use G3/G4 code
        add r6,r6,r5    // point to (end+1) of source and dest

        cmpwi r5,0    // set cr0 on length
        b c64rbyte    // copy a byte at a time

// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.

// r8 = inverse of largest mask of low-order 1s smaller than operand length
// cr5 = noncache flag

        add r6,r6,r5    // point to (end+1) of source and dest

        rlwinm r7,r4,0,29,31    // r7 <- #bytes to doubleword align dest
        andc. r7,r7,r8    // limit by operand length
        sub r5,r5,r7    // adjust length
        srwi r8,r5,6    // r8 <- 64-byte chunks to xfer
        cmpwi cr1,r8,0    // any chunks?
        beq c64rd2    // source already doubleword aligned
c64rd1:    // copy bytes until source doubleword aligned
c64rd2:    // r8/cr1 <- count of 64-byte chunks
        rlwinm r0,r5,29,29,31    // r0 <- count of leftover doublewords
        andi. r5,r5,7    // r5/cr0 <- count of leftover bytes
        cmpwi cr7,r0,0    // leftover doublewords?
        beq cr1,c64rd4    // no chunks to xfer

        .align 5    // align inner loop
c64rd3:    // loop copying 64-byte chunks

c64rd4:    // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq cr7,c64rbyte    // no leftover doublewords

c64rd5:    // loop copying leftover doublewords

// Reverse byte loop.

c64rbyte:    // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr    // done if no leftover bytes