/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
; Copy bytes of data around. Handles overlapped data.

#include <ppc/proc_reg.h>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache        20

; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32              // total size
#define BCOPY_SF_MSR    16              // we save caller's MSR here (possibly minus VEC and FP)

#define kShort          32              // short operands are special cased
; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - Interrupts must be disabled when this routine is called.
;            - Translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink.  If both are the same,
; only one is loaded.  We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK, since a 32-bit machine cannot have addresses
; wider than 32 bits anyway.
;
; Note also that this routine is used only on 32-bit machines.  If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
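;
; Register usage at entry (following the 32-bit PowerPC calling convention, in which each
; long long argument arrives in a register pair): r3/r4 = high and low halves of the "from"
; physical address, r5/r6 = high and low halves of the "to" physical address, r7 = byte count.
; The rlwinm/rlwimi pairs below simply glue each high:low pair back into a single register,
; after which the arguments sit in r3-r5 exactly as bcopy expects.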
        .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mfsprg  r8,2                        ; get processor feature flags
        stw     r0,8(r1)                    ; save return address
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        subi    r0,r7,1                     ; get length - 1
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        add     r11,r3,r0                   ; Point to last byte of sink
        mr      r5,r7                       ; Get the length into the right register
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
; This test for page overflow may not work if the length is negative.  Negative lengths are invalid
; input to bcopy_physvir() on 32-bit machines, and will result in a panic.

        add     r12,r4,r0                   ; Point to last byte of source
        xor     r7,r11,r3                   ; See if we went to next page
        xor     r8,r12,r4                   ; See if we went to next page
        or      r0,r7,r8                    ; Combine wrap

//      li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
        li      r9,((2<<3)|2)               ; Set default attributes
        rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
        li      r7,2                        ; Set validity flags
        li      r8,2                        ; Set validity flags
        bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...
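
; How the page-crossing test works: r11/r12 hold the last byte of each operand and r3/r4 hold
; the first byte; XORing first and last leaves bits set only where the two addresses differ,
; and the rlwinm. masks off the low 12 bits, so the result is nonzero exactly when an operand's
; first and last bytes lie in different 4KB pages.  For example, an operand starting at 0x1FFC
; with an 8-byte length ends at 0x2003; the xor is 0x3FFF, which still has bits set above the
; low 12 bits, so we take the bne- to bcopy_phys1.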
        rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
        rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
        rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
        rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
        cmplw   cr1,r11,r12                 ; See if sink and source are same block
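
; A note on the values just built (per the classic 32-bit PowerPC BAT layout): each lower DBAT
; word keeps the top bits of the operand address as the block real page number and takes its
; low bits from r9, i.e. WIMG and PP attributes for a cached, coherent, read/write mapping (the
; commented-out line above shows the intent).  Each upper DBAT word takes its block effective
; page index from the same address plus the "2" in r7/r8, which marks the BAT valid in
; supervisor state only.  The block-length field is left zero, so each BAT maps the minimum
; 128KB block at a virtual address equal to the physical address, which is plenty because
; neither operand may cross a page.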
        mtdbatl 0,r11                       ; Set sink lower DBAT
        mtdbatu 0,r7                        ; Set sink upper DBAT

        beq-    cr1,bcpvsame                ; Source and sink are in same block

        mtdbatl 1,r12                       ; Set source lower DBAT
        mtdbatu 1,r8                        ; Set source upper DBAT
bcpvsame:
        sync                                ; wait for the BATs to stabilize
        isync

        bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

        li      r0,0                        ; Get set to invalidate upper half of BATs
        sync                                ; Make sure all is well
        mtdbatu 0,r0                        ; Clear sink upper DBAT
        mtdbatu 1,r0                        ; Clear source upper DBAT
        sync
        isync

        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        mtlr    r0
        blr
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy.  This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK, since a 32-bit machine cannot have addresses
; wider than 32 bits anyway.
;
; Also note that you probably will not be happy if either the sink or source spans the
; boundary between RAM and I/O space.  There is a good chance of hanging the machine,
; and this code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.
        .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        stw     r0,8(r1)                    ; save return address
        mfsprg  r8,2                        ; get processor feature flags
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
        mr      r5,r7                       ; Get the length into the right register
bcopy_phys1:                                ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
        mfmsr   r9                          ; Get the MSR
        lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
        ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
        andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
        bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

        mtmsr   r9                          ; turn DR, FP, and VEC off
        isync                               ; wait for it to happen

        bl      EXT(bcopy)                  ; do the copy with translation off and caching on

        mfmsr   r9                          ; Get the MSR
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
        mtmsr   r9                          ; restore msr
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        mtlr    r0
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        blr
; 64-bit: turn DR off and SF on.

bcopy_phys64:                               ; r9 = MSR with DR, VEC, and FP off
        ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
        srdi    r2,r3,31                    ; Get a 1 if source is in I/O memory
        li      r0,1                        ; Note - we use this in a couple places below
        srdi    r10,r4,31                   ; Get a 1 if sink is in I/O memory
        std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
        rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on in MSR we will copy with
        cmpldi  cr0,r2,1                    ; Is source in I/O memory?
        cmpldi  cr7,r10,1                   ; Is sink in I/O memory?
        mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
        isync                               ; wait for it to happen
        cror    cr7_eq,cr0_eq,cr7_eq        ; See if either source or sink is in I/O area
        beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space
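
; How the I/O-space test works: srdi by 31 yields exactly 1 when the 64-bit physical address
; lies in the range 0x80000000-0xFFFFFFFF (the 2GB-4GB window), which, per the comments above,
; is the region these machines decode as I/O space rather than RAM.  Anything below 2GB, or at
; 4GB and above, compares unequal to 1 and takes the normal cached path.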
        bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
        mfmsr   r9                          ; Get the MSR we used to copy
        rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
        mtmsrd  r9                          ; turn 64-bit mode off, translation back on
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
        mtlr    r0
        mtmsrd  r8,1                        ; turn EE back on if necessary
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        blr
; We need to copy with DR off, but one of the operands is in I/O space.  To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4.  We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well.  This is an unpleasant hack.

io_space_real_mode_copy:                    ; r0=1, r9=MSR we want to copy with
        sldi    r11,r0,31-MSR_EE_BIT        ; Get a mask for the EE bit
        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        andc    r9,r9,r11                   ; Turn off EE bit
        mfspr   r2,hid4                     ; Get HID4
        mtmsrd  r9,1                        ; Force off EE
        or      r2,r2,r0                    ; Set bit to make real accesses cache-inhibited
        sync
        mtspr   hid4,r2                     ; Make real accesses cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared
        bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

        li      r0,1                        ; Get a 1 again so we can rebuild the HID4 bit
        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        mfspr   r2,hid4                     ; Get HID4
        andc    r2,r2,r0                    ; Clear bit to make real accesses cache-inhibited
        sync
        mtspr   hid4,r2                     ; Make real accesses not cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared

        b       bcopy_phys64x               ; rejoin the normal 64-bit bcopy_phys exit path
; Special case short operands (<32 bytes), which are very common.  Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so.  This is OK, since performance of the two cases
; is similar.  We do get the direction right when it counts (ie, when the operands overlap.)
; Also note that we use the G3/G4 "backend" code, even on G5.  This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5.  Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
;
;       r12 = (dest - source)

shortcopy:
        cmplw   r12,r5                      ; must move reverse if (dest-source)<length
        mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
        mtcrf   1,r5                        ; ...which is faster on G4 and G5
        bge++   backend                     ; handle forward moves (most common case)
        add     r6,r6,r5                    ; point one past end of operands in reverse moves
        add     r4,r4,r5
        b       bbackend                    ; handle reverse moves
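
; Why the unsigned compare above is enough: if the destination starts above the source, a
; forward copy would clobber not-yet-copied source bytes exactly when (dest - source) < length,
; which is what cmplw tests.  If the destination is below the source, the subtraction wraps to
; a huge unsigned value, so the compare always chooses the (safe) forward path.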
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine.  What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)

        .globl  EXT(bcopy)
        .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(bcopy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; to==from so nothing to do
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we cannot use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions.  Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

        .globl  EXT(bcopy_nc)
        .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
        cmpwi   cr1,r5,0                    ; Check if we have a 0 length
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        crset   noncache                    ; Set non-cached
        cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; either zero length or to==from
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.

        .globl  EXT(memcpy)
        .globl  EXT(memmove)
        .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r3,r4                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r4                       ; Set source
        mr      r4,r3                       ; Set the "to" (must preserve r3 for return value)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(memcpy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        beqlr-                              ; exit if to==from
; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlaps must be handled.  We also cannot use the
; cache instructions if we were called from bcopy_nc().  We must preserve r3 because
; memcpy() returns it.  We can be interrupted and lose control here.
;
;       r12 = (dest - source)
;       cr5 = noncache flag

copyit32:                                   ; WARNING! can drop down to this label
        cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
        cntlzw  r11,r5                      ; get magnitude of length
        dcbt    0,r6                        ; start to touch in source
        lis     r10,hi16(0x80000000)        ; get 0x80000000
        neg     r9,r4                       ; start to get alignment for destination
        dcbtst  0,r4                        ; start to touch in destination
        sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
        blt-    cr1,reverse32bit            ; reverse move required
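
; Example of the alignment mask: for a length of 80 bytes, cntlzw returns 25, and shifting
; 0x80000000 right arithmetically by 25 gives 0xFFFFFFC0, the complement of 63 (the largest
; 2**n-1 smaller than the length).  The andc against this mask in the front-end code below
; keeps the alignment moves from exceeding the total operand length.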
; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
;
;       r8 = inverse of largest mask smaller than operand length
;       r9 = neg(dest), used to compute alignment
;       cr5 = noncache flag

forward32bit:                               ; enter from 64-bit CPUs with word aligned uncached operands
        rlwinm  r7,r9,0,0x1F                ; get bytes to 32-byte-align destination
        andc.   r0,r7,r8                    ; limit to the maximum front end move
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        beq     alline                      ; Already on a line...

        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        sub     r5,r5,r0                    ; Set the length left to move

        bf      31,alhalf                   ; No single byte to do...
        lbz     r7,0(r6)                    ; Get the byte
        addi    r6,r6,1                     ; Point to the next
        stb     r7,0(r4)                    ; Save the single
        addi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

alhalf: bf      30,alword                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

alword: bf      29,aldouble                 ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

aldouble: bf    28,alquad                   ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

alquad: bf      27,alline                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        stw     r7,0(r4)                    ; Save the first word
        lwz     r11,12(r6)                  ; Get the fourth word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Sink is line aligned here
alline: rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    backend                     ; No full lines to move

        mtctr   r0                          ; set up loop count
        li      r0,96                       ; Stride for touch ahead

nxtline:
        lwz     r2,0(r6)                    ; Get the first word
        lwz     r5,4(r6)                    ; Get the second word
        lwz     r7,8(r6)                    ; Get the third word
        lwz     r8,12(r6)                   ; Get the fourth word
        lwz     r9,16(r6)                   ; Get the fifth word
        lwz     r10,20(r6)                  ; Get the sixth word
        lwz     r11,24(r6)                  ; Get the seventh word
        lwz     r12,28(r6)                  ; Get the eighth word
        bt-     noncache,skipz              ; Skip if we are not cached...
        dcbz    0,r4                        ; Blow away the whole line because we are replacing it
        dcbt    r6,r0                       ; Touch ahead a bit

skipz:
        addi    r6,r6,32                    ; Point to the next
        stw     r2,0(r4)                    ; Save the first word
        stw     r5,4(r4)                    ; Save the second word
        stw     r7,8(r4)                    ; Save the third word
        stw     r8,12(r4)                   ; Save the fourth word
        stw     r9,16(r4)                   ; Save the fifth word
        stw     r10,20(r4)                  ; Save the sixth word
        stw     r11,24(r4)                  ; Save the seventh word
        stw     r12,28(r4)                  ; Save the eighth word
        addi    r4,r4,32                    ; Bump sink
        bdnz+   nxtline                     ; Do the next line, if any...
; Move backend quadword

backend:                                    ; Join here from "shortcopy" for forward moves <32 bytes
        bf      27,noquad                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        lwz     r11,12(r6)                  ; Get the fourth word
        stw     r7,0(r4)                    ; Save the first word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Move backend double

noquad: bf      28,nodouble                 ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

; Move backend word

nodouble: bf    29,noword                   ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Move backend halfword

noword: bf      30,nohalf                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

; Move backend byte

nohalf: bflr    31                          ; Leave cuz we are all done...
        lbz     r7,0(r6)                    ; Get the byte
        stb     r7,0(r4)                    ; Save the single
        blr
; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz.  Fortunately, reverse moves are uncommon.
;
;       r8 = inverse of largest mask smaller than operand length
;       cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:                               ; here from 64-bit code with word aligned uncached operands
        add     r4,r5,r4                    ; Point past the last sink byte
        add     r6,r5,r6                    ; Point past the last source byte
        rlwinm  r7,r4,0,0x1F                ; Calculate the length to align dest on cache boundary
        li      r12,-1                      ; Make sure we touch in the actual line
        andc.   r0,r7,r8                    ; Apply movement limit
        dcbt    r12,r6                      ; Touch in the last line of source
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        dcbtst  r12,r4                      ; Touch in the last line of the sink
        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        beq-    balline                     ; Already on cache line boundary (or too short to bother)

        sub     r5,r5,r0                    ; Precalculate move length left after alignment

        bf      31,balhalf                  ; No single byte to do...
        lbz     r7,-1(r6)                   ; Get the byte
        subi    r6,r6,1                     ; Point to the next
        stb     r7,-1(r4)                   ; Save the single
        subi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

balhalf: bf     30,balword                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword: bf     29,baldouble                ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble: bf   28,balquad                  ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad: bf     27,balline                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here
balline: rlwinm. r0,r5,27,5,31              ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    bbackend                    ; No full lines to move
        mtctr   r0                          ; set up loop count

bnxtline:
        lwz     r7,-32(r6)                  ; Get the first word
        lwz     r5,-28(r6)                  ; Get the second word
        lwz     r2,-24(r6)                  ; Get the third word
        lwz     r12,-20(r6)                 ; Get the fourth word
        lwz     r11,-16(r6)                 ; Get the fifth word
        lwz     r10,-12(r6)                 ; Get the sixth word
        lwz     r9,-8(r6)                   ; Get the seventh word
        lwz     r8,-4(r6)                   ; Get the eighth word
        subi    r6,r6,32                    ; Point to the next

        stw     r7,-32(r4)                  ; Save the first word
        stw     r5,-28(r4)                  ; Save the second word
        stw     r2,-24(r4)                  ; Save the third word
        stw     r12,-20(r4)                 ; Save the fourth word
        stw     r11,-16(r4)                 ; Save the fifth word
        stw     r10,-12(r4)                 ; Save the sixth word
        stw     r9,-8(r4)                   ; Save the seventh word
        stw     r8,-4(r4)                   ; Save the eighth word
        subi    r4,r4,32                    ; Bump sink

        bdnz+   bnxtline                    ; Do the next line, if any...
; Note: We touched these lines in at the beginning

; Move backend quadword

bbackend:                                   ; Join here from "shortcopy" for reverse moves of <32 bytes
        bf      27,bnoquad                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad: bf     28,bnodouble                ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Move backend word

bnodouble: bf   29,bnoword                  ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword: bf     30,bnohalf                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Move backend byte

bnohalf: bflr   31                          ; Leave cuz we are all done...
        lbz     r7,-1(r6)                   ; Get the byte
        stb     r7,-1(r4)                   ; Save the single
        blr
// Here on 64-bit processors, which have a 128-byte cache line.  This can be
// called either in 32- or 64-bit mode, which makes the test for reverse moves
// a little tricky.  We've already filtered out the (source==dest) and (length==0) cases.
//
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      r12 = (dest - source), reverse move required if (dest-source)<length
//      cr5 = noncache flag

copyit64:
        rlwinm  r7,r5,0,0,31                // truncate length to 32-bit, in case we're running in 64-bit mode
        cntlzw  r11,r5                      // get magnitude of length
        dcbt    0,r6                        // touch in 1st block of source
        dcbtst  0,r4                        // touch in 1st destination cache block
        subc    r7,r12,r7                   // set Carry if (dest-source)>=length, in mode-independent way
        li      r0,0                        // get a 0
        lis     r10,hi16(0x80000000)        // get 0x80000000
        addze.  r0,r0                       // set cr0 on carry bit (beq if reverse move required)
        neg     r9,r4                       // start to get alignment for destination
        sraw    r8,r10,r11                  // get mask based on operand length, to limit alignment
        bt--    noncache,c64uncached        // skip if uncached
        beq--   c64rdouble                  // handle cached reverse moves
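
// How the mode-independent reverse test works: subc computes (dest-source) minus the length
// and sets Carry exactly when there is no borrow, i.e. when (dest-source) >= length as an
// unsigned comparison over the full register width.  addze. then adds Carry into the zeroed
// r0, so cr0 is "eq" (r0 still zero) precisely when a reverse move is required.  Using the
// carry chain instead of cmplw/cmpld keeps the same code correct in both 32- and 64-bit mode.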
// Forward, cached or doubleword aligned uncached.  This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned.  We depend on this in the uncached case.
//
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      cr5 = noncache flag

c64double:
        rlwinm  r7,r9,0,0x7F                // get #bytes to 128-byte align destination
        andc    r7,r7,r8                    // limit by operand length
        andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
        srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
        sub     r5,r5,r7                    // adjust length remaining
        cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
        srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi   cr7,r10,0                   // set cr7 on chunk count
        beq     c64double2                  // dest already doubleword aligned
        .align  5                           // align inner loops
c64double1:                                 // copy bytes until dest is doubleword aligned

c64double2:                                 // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq     cr1,c64double4              // no doublewords to xfer in order to cache align

        .align  5                           // align inner loops
c64double3:                                 // copy doublewords until dest is 128-byte aligned

// Here to xfer 128-byte chunks, if any.  Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                 // r10/cr7=128-byte chunks
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
        cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
        beq     cr7,c64double7              // no 128-byte chunks
; We must check for (source-dest)<128 in a mode-independent way.  If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.

        sub     r8,r6,r4                    // r8 <- (source - dest)
        rldicr. r0,r8,0,63-7                // zero low 7 bits and check for 0, mode independent
        cror    noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128
        mtctr   r10                         // set up chunk loop count
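
// Note on the test above: rldicr. clears the low 7 bits of (source-dest) and sets cr0_eq only
// when all the bits above them are zero, i.e. only when 0 <= (source-dest) < 128.  That is
// exactly the case in which dcbz128 on a destination line could zero source bytes that have
// not been loaded yet, so we fall back to the non-dcbz path.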
        .align  5                           // align inner loop
c64InnerLoop:                               // loop copying 128-byte cache lines to 128-aligned destination
        ld      r0,0(r6)                    // start pipe: load 1st half-line

        bt      noncache,c64InnerLoop1      // skip if uncached or overlap
        dcbz128 0,r4                        // avoid prefetch of next cache line

        ld      r0,64(r6)                   // load 2nd half of chunk

        addi    r4,r4,128                   // advance to next dest chunk

        bdnz    c64InnerLoop                // loop if more chunks
c64double7:                                 // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
        beq     cr1,c64byte                 // no leftover doublewords

        .align  5                           // align inner loop
c64double8:                                 // loop copying leftover doublewords

// Forward byte loop.

c64byte:                                    // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes

        .align  5                           // align inner loop
// Uncached copies.  We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      r12 = (dest-source), used to test relative alignment
//      cr0 = beq if reverse move required
//      cr5 = noncache flag

c64uncached:
        rlwinm  r10,r12,0,29,31             // relatively doubleword aligned?
        rlwinm  r11,r12,0,30,31             // relatively word aligned?
        cmpwi   cr7,r10,0                   // set cr7 beq if doubleword aligned
        cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
        beq--   c64reverseUncached

        beq     cr7,c64double               // doubleword aligned
        beq     cr1,forward32bit            // word aligned, use G3/G4 code
        cmpwi   r5,0                        // set cr0 on byte count
        b       c64byte                     // unaligned operands

c64reverseUncached:
        beq     cr7,c64rdouble              // doubleword aligned so can use LD/STD
        beq     cr1,reverse32bit            // word aligned, use G3/G4 code
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        cmpwi   r5,0                        // set cr0 on length
        b       c64rbyte                    // copy a byte at a time
// Reverse doubleword copies.  This is used for all cached copies, and doubleword
// aligned uncached copies.
//
//      r8 = inverse of largest mask of low-order 1s smaller than operand length
//      cr5 = noncache flag

c64rdouble:
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        rlwinm  r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
        andc.   r7,r7,r8                    // limit by operand length
        sub     r5,r5,r7                    // adjust length
        srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
        cmpwi   cr1,r8,0                    // any chunks?
        beq     c64rd2                      // dest already doubleword aligned
c64rd1:                                     // copy bytes until dest is doubleword aligned

c64rd2:                                     // r8/cr1 <- count of 64-byte chunks
        rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
        cmpwi   cr7,r0,0                    // leftover doublewords?
        beq     cr1,c64rd4                  // no chunks to xfer

        .align  5                           // align inner loop
c64rd3:                                     // loop copying 64-byte chunks

c64rd4:                                     // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq     cr7,c64rbyte                // no leftover doublewords

c64rd5:                                     // loop copying leftover doublewords

// Reverse byte loop.

c64rbyte:                                   // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes