/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
; Copy bytes of data around. Handles overlapped data.

#include <ppc/proc_reg.h>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)

; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32              // total size
#define BCOPY_SF_MSR    16              // we save caller's MSR here (possibly minus VEC and FP)

#define kShort          32              // short operands are special cased
; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - interrupts must be disabled when this routine is called.
;            - translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink. If both fall in the
; same block, only one DBAT is loaded. We do not touch the IBATs, so there is no issue if
; either physical page address is the same as the virtual address of the instructions we
; are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored; this
; should be OK, since such machines cannot hand us addresses wider than 32 bits here.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
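; A minimal C sketch (not part of the build) of the page-crossing test performed below
; before the DBAT fast path is chosen; it assumes 4KB pages and looks only at the low
; 32 bits of the physical address:
;
;   static int crosses_page(unsigned int pa, unsigned int nbytes)
;   {
;       unsigned int last = pa + (nbytes - 1);      /* last byte touched           */
;       return ((pa ^ last) & 0xFFFFF000u) != 0;    /* did the page number change? */
;   }
;
; If either operand crosses a page, the code falls back to the normal bcopy_phys path.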
            .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
            mflr    r0                          ; get return address
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mfsprg  r8,2                        ; get processor feature flags
            stw     r0,8(r1)                    ; save return address
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            subi    r0,r7,1                     ; get length - 1
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            add     r11,r3,r0                   ; Point to last byte of sink
            mr      r5,r7                       ; Get the length into the right register
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
; This test for page overflow may not work if the length is negative. Negative lengths are
; invalid input to bcopy_physvir_32() on 32-bit machines, and will result in a panic.
            add     r12,r4,r0                   ; Point to last byte of source
            xor     r7,r11,r3                   ; See if we went to next page
            xor     r8,r12,r4                   ; See if we went to next page
            or      r0,r7,r8                    ; Combine wrap

//          li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
            li      r9,((2<<3)|2)               ; Set default attributes
            rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
            li      r7,2                        ; Set validity flags
            li      r8,2                        ; Set validity flags
            bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...

            rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
            rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
            rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
            rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
            cmplw   cr1,r11,r12                 ; See if sink and source are same block

            mtdbatl 0,r11                       ; Set sink lower DBAT
            mtdbatu 0,r7                        ; Set sink upper DBAT

            beq-    cr1,bcpvsame                ; Source and sink are in same block

            mtdbatl 1,r12                       ; Set source lower DBAT
            mtdbatu 1,r8                        ; Set source upper DBAT

bcpvsame:
            sync                                ; wait for the BATs to stabilize
            isync                               ; context-synchronize before using the new mappings

            bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

            li      r0,0                        ; Get set to invalidate upper half of BATs
            sync                                ; Make sure all is well
            mtdbatu 0,r0                        ; Clear sink upper DBAT
            mtdbatu 1,r0                        ; Clear source upper DBAT

            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            mtlr    r0                          ; restore return address
            blr
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored; this
; should be OK, since such machines cannot hand us addresses wider than 32 bits here.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
; code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.
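; Hedged C-level sketch of the control flow only (not part of the build); read_msr(),
; write_msr(), and copy_with_translation_off() are hypothetical stand-ins for the
; mfmsr/mtmsr(d) sequences and the bcopy call below:
;
;   void bcopy_phys_sketch(unsigned long long from, unsigned long long to,
;                          unsigned int nbytes)
;   {
;       unsigned long long msr = read_msr();           /* caller's MSR                    */
;       write_msr(msr & ~(MSR_DR | MSR_VEC | MSR_FP)); /* data translation, VEC, FP off   */
;       copy_with_translation_off(from, to, nbytes);   /* ordinary bcopy on phys addrs    */
;       write_msr(msr);                                /* put translation back as it was  */
;   }
;
; The real code below also sets MSR_SF (64-bit addressing) on 64-bit processors and
; special-cases operands that lie in I/O space.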
            .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
            mflr    r0                          ; get return address
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            stw     r0,8(r1)                    ; save return address
            mfsprg  r8,2                        ; get processor feature flags
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
            mr      r5,r7                       ; Get the length into the right register
bcopy_phys1:                                    ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
            mfmsr   r9                          ; Get the MSR
            lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
            ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
            andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
            bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

; 32-bit CPUs

            mtmsr   r9                          ; turn DR, FP, and VEC off
            isync                               ; wait for it to happen

            bl      EXT(bcopy)                  ; do the copy with translation off and caching on

            mfmsr   r9                          ; Get the MSR
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
            mtmsr   r9                          ; restore msr
            isync                               ; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            mtlr    r0                          ; restore return address
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr
; 64-bit: turn DR off and SF on.

bcopy_phys64:                                   ; r9 = MSR with DR, VEC, and FP off
            ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
            srdi    r2,r3,31                    ; Get a 1 if source is in I/O memory
            li      r0,1                        ; Note - we use this in a couple places below
            srdi    r10,r4,31                   ; Get a 1 if sink is in I/O memory
            std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
            rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on in MSR we will copy with
            cmpldi  cr0,r2,1                    ; Is source in I/O memory?
            cmpldi  cr7,r10,1                   ; Is sink in I/O memory?
            mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
            isync                               ; wait for it to happen
            cror    cr7_eq,cr0_eq,cr7_eq        ; See if either source or sink is in I/O area
            beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space

            bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:                                  ; the I/O-space path below rejoins here
            mfmsr   r9                          ; Get the MSR we used to copy
            rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
            mtmsrd  r9                          ; turn 64-bit mode off, translation back on
            isync                               ; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
            mtlr    r0                          ; restore return address
            mtmsrd  r8,1                        ; turn EE back on if necessary
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr
; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.
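; Flow-only C sketch (not part of the build) of the sequence below; read_hid4(),
; write_hid4(), disable_interrupts(), and bcopy_nc_on_phys() are hypothetical stand-ins
; for the mfspr/mtspr/mtmsrd instructions and the bcopy_nc call, and the bit position
; mirrors the "sldi by 32+8" computed below:
;
;   void io_space_copy_sketch(unsigned long long from, unsigned long long to,
;                             unsigned int nbytes)
;   {
;       unsigned long long ci_bit = 1ULL << 40;     /* cache-inhibit control bit          */
;       disable_interrupts();                       /* must not lose control in this state */
;       write_hid4(read_hid4() | ci_bit);           /* real-mode accesses now uncached     */
;       bcopy_nc_on_phys(from, to, nbytes);         /* copy with no cache instructions     */
;       write_hid4(read_hid4() & ~ci_bit);          /* restore normal cacheability         */
;   }
;
; The point is the ordering: interrupts stay off for the entire window in which HID4
; marks real accesses cache-inhibited.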
io_space_real_mode_copy:                        ; r0=1, r9=MSR we want to copy with
            sldi    r11,r0,31-MSR_EE_BIT        ; Get a mask for the EE bit
            sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
            andc    r9,r9,r11                   ; Turn off EE bit
            mfspr   r2,hid4                     ; Get HID4
            mtmsrd  r9,1                        ; Force off EE
            or      r2,r2,r0                    ; Set bit to make real accesses cache-inhibited
            sync                                ; Ensure prior accesses complete before changing HID4
            mtspr   hid4,r2                     ; Make real accesses cache-inhibited
            isync                               ; Toss prefetches

            lis     r12,0xE000                  ; Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
            slbie   r12                         ; Make sure the ERAT is cleared

            bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

            li      r0,1                        ; Get a 1
            sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
            mfspr   r2,hid4                     ; Get HID4
            andc    r2,r2,r0                    ; Clear bit to make real accesses cache-inhibited
            sync                                ; Ensure the copy's accesses complete before changing HID4
            mtspr   hid4,r2                     ; Make real accesses not cache-inhibited
            isync                               ; Toss prefetches

            lis     r12,0xE000                  ; Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
            slbie   r12                         ; Make sure the ERAT is cleared

            b       bcopy_phys64x               ; rejoin the common 64-bit exit path above
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (i.e., when the operands overlap.)
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
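; Hedged C sketch (not part of the build) of the direction test used here and in the
; full-size paths: treat (dest - source) as unsigned; if it is smaller than the length,
; a forward copy would overwrite source bytes before they are read, so copy in reverse.
; Unsigned wraparound makes the same test answer "forward" whenever dest < source:
;
;   static int must_copy_reverse(unsigned long dest, unsigned long source,
;                                unsigned long nbytes)
;   {
;       return (dest - source) < nbytes;
;   }
;
; This is also why the check above is only "almost" exact in 64-bit mode: it errs on the
; side of a reverse move, never the other way, so overlapping operands are always safe.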
; r12 = (dest - source)

shortcopy:
            cmplw   r12,r5                      ; must move reverse if (dest-source)<length
            mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
            mtcrf   1,r5                        ; ...which is faster on G4 and G5
            bge++   backend                     ; handle forward moves (most common case)

            add     r6,r6,r5                    ; point one past end of operands in reverse moves
            add     r4,r4,r5
            b       bbackend                    ; handle reverse moves
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)
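; Hedged C-level sketch (not part of the build) of the dispatch done at the entry point
; below; copy_short(), copy_32(), copy_64(), and cpu_is_64bit() are hypothetical
; stand-ins for shortcopy, copyit32, copyit64, and the branch patched to NOP at boot:
;
;   void bcopy_sketch(const char *from, char *to, unsigned long nbytes)
;   {
;       if (nbytes < kShort) {                /* kShort == 32, defined above        */
;           copy_short(from, to, nbytes);     /* short operands, no cache ops       */
;       } else if (to != from) {              /* to == from means nothing to do     */
;           if (cpu_is_64bit())
;               copy_64(from, to, nbytes);    /* 128-byte cache line backend        */
;           else
;               copy_32(from, to, nbytes);    /*  32-byte cache line backend        */
;       }
;   }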
            .globl  EXT(bcopy)
            .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
            cmplwi  cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
            blt     cr1,shortcopy               ; special case short operands
            crclr   noncache                    ; Set cached
LEXT(bcopy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            bne+    copyit32                    ; handle 32-bit processor
            blr                                 ; to==from so nothing to do
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory, so we cannot use any kind of cache instruction.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
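; Hedged C sketch (not part of the build) of why the always-safe fallback for uncached
; memory is a plain byte loop: byte accesses are naturally aligned by definition, so they
; can never raise an alignment exception on an uncached mapping.
;
;   static void copy_bytes_forward(const unsigned char *src, unsigned char *dst,
;                                  unsigned long n)
;   {
;       while (n--)
;           *dst++ = *src++;
;   }
;
; The code below does better than this whenever the operands are relatively word or
; doubleword aligned, but never worse.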
            .globl  EXT(bcopy_nc)
            .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
            cmpwi   cr1,r5,0                    ; Check if we have a 0 length
            sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
            crset   noncache                    ; Set non-cached
            cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            bne+    copyit32                    ; handle 32-bit processor
            blr                                 ; either zero length or to==from
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.
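; Hedged C sketch (not part of the build) of the argument shuffle performed below:
; memcpy()/memmove() take (to, from, n) and must return "to", while the shared copy
; engine works with bcopy()'s (from, to, n) ordering. Because the engine already handles
; overlap, memmove() needs nothing extra:
;
;   void *memmove_sketch(void *to, const void *from, unsigned long nbytes)
;   {
;       bcopy(from, to, nbytes);    /* engine copies forward or reverse as needed */
;       return to;                  /* r3 is preserved for exactly this reason    */
;   }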
            .globl  EXT(memcpy)
            .globl  EXT(memmove)
            .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
            cmplwi  cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r3,r4                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r4                       ; Set source
            mr      r4,r3                       ; Set the "to" (must preserve r3 for return value)
            blt     cr1,shortcopy               ; special case short operands
            crclr   noncache                    ; Set cached
LEXT(memcpy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            beqlr-                              ; exit if to==from
                                                ; ...otherwise fall through into copyit32
; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlap must be handled (that is why we may have to
; copy in reverse.) We also cannot use the cache instructions if we came from bcopy_nc().
; We need to preserve r3 because it must be returned for memcpy.
; We can be interrupted and lose control here.
;
; r12 = (dest - source)
; cr5 = noncache flag
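; Hedged C sketch (not part of the build) of the cntlzw/sraw trick used below (and again
; at copyit64): derive a mask from the length's magnitude so that the "align the
; destination" front end can never be asked to move more bytes than the operand has.
;
;   static unsigned int align_limit_mask(unsigned int len)   /* requires len != 0 */
;   {
;       int clz = __builtin_clz(len);                        /* cntlzw r11,r5     */
;       return (unsigned int)((int)0x80000000 >> clz);       /* sraw  r8,r10,r11  */
;   }
;
; The front-end byte count is then computed as "bytes_to_align & ~mask" (the andc below),
; which clamps it to less than the operand length.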
copyit32:                                       ; WARNING! can drop down to this label
            cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
            cntlzw  r11,r5                      ; get magnitude of length
            dcbt    0,r6                        ; start to touch in source
            lis     r10,hi16(0x80000000)        ; get 0x80000000
            neg     r9,r4                       ; start to get alignment for destination
            dcbtst  0,r4                        ; start to touch in destination
            sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
            blt-    cr1,reverse32bit            ; reverse move required
; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
;
; r8 = inverse of largest mask smaller than operand length
; r9 = neg(dest), used to compute alignment
; cr5 = noncache flag
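; A short C note (not part of the build) on the "relatively word aligned" property relied
; on here: once the destination has been word aligned by the front end, the source is
; word aligned too exactly when (dest - source) is a multiple of 4.
;
;   static int relatively_word_aligned(unsigned long dest, unsigned long source)
;   {
;       return ((dest - source) & 3) == 0;
;   }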
forward32bit:                                   ; enter from 64-bit CPUs with word aligned uncached operands
            rlwinm  r7,r9,0,0x1F                ; get bytes to 32-byte-align destination
            andc.   r0,r7,r8                    ; limit to the maximum front end move
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
            beq     alline                      ; Already on a line...

            mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
            sub     r5,r5,r0                    ; Set the length left to move

            bf      31,alhalf                   ; No single byte to do...
            lbz     r7,0(r6)                    ; Get the byte
            addi    r6,r6,1                     ; Point to the next
            stb     r7,0(r4)                    ; Save the single
            addi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

alhalf:     bf      30,alword                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

alword:     bf      29,aldouble                 ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

aldouble:   bf      28,alquad                   ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

alquad:     bf      27,alline                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            stw     r7,0(r4)                    ; Save the first word
            lwz     r11,12(r6)                  ; Get the fourth word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink

; Sink is line aligned here
alline:     rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
            mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
            beq-    backend                     ; No full lines to move

            mtctr   r0                          ; set up loop count
            li      r0,96                       ; Stride for touch ahead

nxtline:
            lwz     r2,0(r6)                    ; Get the first word
            lwz     r5,4(r6)                    ; Get the second word
            lwz     r7,8(r6)                    ; Get the third word
            lwz     r8,12(r6)                   ; Get the fourth word
            lwz     r9,16(r6)                   ; Get the fifth word
            lwz     r10,20(r6)                  ; Get the sixth word
            lwz     r11,24(r6)                  ; Get the seventh word
            lwz     r12,28(r6)                  ; Get the eighth word
            bt-     noncache,skipz              ; Skip if we are not cached...
            dcbz    0,r4                        ; Blow away the whole line because we are replacing it
            dcbt    r6,r0                       ; Touch ahead a bit

skipz:
            addi    r6,r6,32                    ; Point to the next
            stw     r2,0(r4)                    ; Save the first word
            stw     r5,4(r4)                    ; Save the second word
            stw     r7,8(r4)                    ; Save the third word
            stw     r8,12(r4)                   ; Save the fourth word
            stw     r9,16(r4)                   ; Save the fifth word
            stw     r10,20(r4)                  ; Save the sixth word
            stw     r11,24(r4)                  ; Save the seventh word
            stw     r12,28(r4)                  ; Save the eighth word
            addi    r4,r4,32                    ; Bump sink
            bdnz+   nxtline                     ; Do the next line, if any...
; Move backend quadword

backend:                                        ; Join here from "shortcopy" for forward moves <32 bytes
            bf      27,noquad                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            lwz     r11,12(r6)                  ; Get the fourth word
            stw     r7,0(r4)                    ; Save the first word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink

; Move backend double

noquad:     bf      28,nodouble                 ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

nodouble:   bf      29,noword                   ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

; Move backend halfword

noword:     bf      30,nohalf                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink
nohalf:     bflr    31                          ; Leave cuz we are all done...
            lbz     r7,0(r6)                    ; Get the byte
            stb     r7,0(r4)                    ; Save the single
            blr                                 ; Return, the move is complete
; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
;
; r8 = inverse of largest mask smaller than operand length
; cr5 = noncache flag (but we don't dcbz anyway)
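; Hedged C sketch (not part of the build) of the reverse-move setup used below: both
; pointers are advanced one past the end and the copy walks downward, so an overlapping
; destination above the source is never written before the corresponding bytes are read.
;
;   static void reverse_bytes_sketch(const unsigned char *src, unsigned char *dst,
;                                    unsigned long n)
;   {
;       src += n;                   /* one past the last source byte      */
;       dst += n;                   /* one past the last destination byte */
;       while (n--)
;           *--dst = *--src;
;   }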
reverse32bit:                                   ; here from 64-bit code with word aligned uncached operands
            add     r4,r5,r4                    ; Point past the last sink byte
            add     r6,r5,r6                    ; Point past the last source byte
            rlwinm  r7,r4,0,0x1F                ; Calculate the length to align dest on cache boundary
            li      r12,-1                      ; Make sure we touch in the actual line
            andc.   r0,r7,r8                    ; Apply movement limit
            dcbt    r12,r6                      ; Touch in the last line of source
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
            dcbtst  r12,r4                      ; Touch in the last line of the sink
            mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
            beq-    balline                     ; Already on cache line boundary (or too short to bother)

            sub     r5,r5,r0                    ; Precalculate move length left after alignment

            bf      31,balhalf                  ; No single byte to do...
            lbz     r7,-1(r6)                   ; Get the byte
            subi    r6,r6,1                     ; Point to the next
            stb     r7,-1(r4)                   ; Save the single
            subi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

balhalf:    bf      30,balword                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword:    bf      29,baldouble                ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble:  bf      28,balquad                  ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad:    bf      27,balline                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

balline:    rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
            mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
            beq-    bbackend                    ; No full lines to move
            mtctr   r0                          ; set up loop count
bnxtline:
            lwz     r7,-32(r6)                  ; Get the first word
            lwz     r5,-28(r6)                  ; Get the second word
            lwz     r2,-24(r6)                  ; Get the third word
            lwz     r12,-20(r6)                 ; Get the fourth word
            lwz     r11,-16(r6)                 ; Get the fifth word
            lwz     r10,-12(r6)                 ; Get the sixth word
            lwz     r9,-8(r6)                   ; Get the seventh word
            lwz     r8,-4(r6)                   ; Get the eighth word
            subi    r6,r6,32                    ; Point to the next

            stw     r7,-32(r4)                  ; Save the first word
            stw     r5,-28(r4)                  ; Save the second word
            stw     r2,-24(r4)                  ; Save the third word
            stw     r12,-20(r4)                 ; Save the fourth word
            stw     r11,-16(r4)                 ; Save the fifth word
            stw     r10,-12(r4)                 ; Save the sixth word
            stw     r9,-8(r4)                   ; Save the seventh word
            stw     r8,-4(r4)                   ; Save the eighth word
            subi    r4,r4,32                    ; Bump sink

            bdnz+   bnxtline                    ; Do the next line, if any...
; Note: We touched these lines in at the beginning

; Move backend quadword
bbackend:                                       ; Join here from "shortcopy" for reverse moves of <32 bytes
            bf      27,bnoquad                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad:    bf      28,bnodouble                ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

bnodouble:  bf      29,bnoword                  ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword:    bf      30,bnohalf                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink
bnohalf:    bflr    31                          ; Leave cuz we are all done...
            lbz     r7,-1(r6)                   ; Get the byte
            stb     r7,-1(r4)                   ; Save the single
            blr                                 ; Return, the move is complete
// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// cases.
//
// r4 = destination (32 or 64-bit ptr)
// r5 = length (always 32 bits)
// r6 = source (32 or 64-bit ptr)
// r12 = (dest - source), reverse move required if (dest-source)<length
// cr5 = noncache flag
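// Hedged C sketch (not part of the build) of the mode-independent direction test done
// below: subc sets Carry exactly when the subtraction does not borrow, i.e. when
// (dest - source) >= length as an unsigned 64-bit compare, and addze folds Carry into
// cr0 so it can be branched on. Because the test works on full 64-bit register values,
// it gives the same answer whether we were called in 32- or 64-bit mode.
//
//   static int reverse_move_required(unsigned long long dest_minus_source,
//                                    unsigned int length)
//   {
//       return dest_minus_source < (unsigned long long)length;
//   }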
copyit64:
            rlwinm  r7,r5,0,0,31                // truncate length to 32 bits, in case we're running in 64-bit mode
            cntlzw  r11,r5                      // get magnitude of length
            dcbt    0,r6                        // touch in 1st block of source
            dcbtst  0,r4                        // touch in 1st destination cache block
            subc    r7,r12,r7                   // set Carry if (dest-source)>=length, in mode-independent way
            li      r0,0                        // get a 0 to catch the Carry bit
            lis     r10,hi16(0x80000000)        // get 0x80000000
            addze.  r0,r0                       // set cr0 on carry bit (beq if reverse move required)
            neg     r9,r4                       // start to get alignment for destination
            sraw    r8,r10,r11                  // get mask based on operand length, to limit alignment
            bt--    noncache,c64uncached        // skip if uncached
            beq--   c64rdouble                  // handle cached reverse moves
// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.
//
// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// cr5 = noncache flag
c64double:
            rlwinm  r7,r9,0,0x7F                // get #bytes to 128-byte align destination
            andc    r7,r7,r8                    // limit by operand length
            andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
            srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
            sub     r5,r5,r7                    // adjust length remaining
            cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
            srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
            cmpwi   cr7,r10,0                   // set cr7 on chunk count
            beq     c64double2                  // dest already doubleword aligned
            .align  5                           // align inner loops
c64double1:                                     // copy bytes until dest is doubleword aligned

c64double2:                                     // r9/cr1=doublewords, r10/cr7=128-byte chunks
            beq     cr1,c64double4              // no doublewords to xfer in order to cache align

            .align  5                           // align inner loops
c64double3:                                     // copy doublewords until dest is 128-byte aligned

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                     // r10/cr7=128-byte chunks
            rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
            cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
            beq     cr7,c64double7              // no 128-byte chunks
; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
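// Hedged C sketch (not part of the build) of the guard above: dcbz128 zeroes the whole
// destination line before it is filled, which would destroy not-yet-copied source bytes
// if the source lies within the same 128 bytes just above the destination.
//
//   static int too_close_for_dcbz128(unsigned long long source, unsigned long long dest)
//   {
//       return ((source - dest) & ~127ULL) == 0;    /* true iff 0 <= source-dest < 128 */
//   }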
            sub     r8,r6,r4                    // r8 <- (source - dest)
            rldicr. r0,r8,0,63-7                // zero low 7 bits and check for 0, mode independent
            cror    noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128

            .align  5                           // align inner loop
c64InnerLoop:                                   // loop copying 128-byte cache lines to 128-aligned destination
            ld      r0,0(r6)                    // start pipe: load 1st half-line
            bt      noncache,c64InnerLoop1      // skip if uncached or overlap
            dcbz128 0,r4                        // avoid prefetch of next cache line
c64InnerLoop1:
            ld      r0,64(r6)                   // load 2nd half of chunk
            addi    r4,r4,128                   // advance to next dest chunk
            bdnz    c64InnerLoop                // loop if more chunks
c64double7:                                     // r5 <- leftover bytes, cr1 set on doubleword count
            rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
            andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
            beq     cr1,c64byte                 // no leftover doublewords

            .align  5                           // align inner loop
c64double8:                                     // loop copying leftover doublewords

// Forward byte loop.

c64byte:                                        // r5/cr0 <- byte count (can be big if unaligned uncached)
            beqlr                               // done if no leftover bytes

            .align  5                           // align inner loop
// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//
// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// r12 = (dest-source), used to test relative alignment
// cr0 = beq if reverse move required
// cr5 = noncache flag
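// Hedged C sketch (not part of the build) of the dispatch below: pick the widest access
// size that keeps every load/store naturally aligned, based only on the relative
// alignment of the two operands (their difference), then route to the matching path.
//
//   static int uncached_access_size(unsigned long long dest_minus_source)
//   {
//       if ((dest_minus_source & 7) == 0) return 8;   /* relatively doubleword aligned */
//       if ((dest_minus_source & 3) == 0) return 4;   /* relatively word aligned       */
//       return 1;                                     /* fall back to a byte loop      */
//   }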
c64uncached:                                    // uncached operands: choose access size by relative alignment
            rlwinm  r10,r12,0,29,31             // relatively doubleword aligned?
            rlwinm  r11,r12,0,30,31             // relatively word aligned?
            cmpwi   cr7,r10,0                   // set cr7 beq if doubleword aligned
            cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
            beq--   c64reverseUncached          // handle uncached reverse moves

            beq     cr7,c64double               // doubleword aligned
            beq     cr1,forward32bit            // word aligned, use G3/G4 code
            cmpwi   r5,0                        // set cr0 on byte count
            b       c64byte                     // unaligned operands

c64reverseUncached:
            beq     cr7,c64rdouble              // doubleword aligned so can use LD/STD
            beq     cr1,reverse32bit            // word aligned, use G3/G4 code
            add     r6,r6,r5                    // point to (end+1) of source and dest
            add     r4,r4,r5
            cmpwi   r5,0                        // set cr0 on length
            b       c64rbyte                    // copy a byte at a time
// Reverse doubleword copies. This is used for all cached reverse copies, and for
// doubleword aligned uncached reverse copies.
//
// r8 = inverse of largest mask of low-order 1s smaller than operand length
// cr5 = noncache flag
c64rdouble:                                     // cached or doubleword aligned uncached reverse moves
            add     r6,r6,r5                    // point to (end+1) of source and dest
            add     r4,r4,r5
            rlwinm  r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
            andc.   r7,r7,r8                    // limit by operand length
            sub     r5,r5,r7                    // adjust length
            srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
            cmpwi   cr1,r8,0                    // any chunks?
            beq     c64rd2                      // dest already doubleword aligned

c64rd1:                                         // copy bytes until dest is doubleword aligned

c64rd2:                                         // r8/cr1 <- count of 64-byte chunks
            rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
            andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
            cmpwi   cr7,r0,0                    // leftover doublewords?
            beq     cr1,c64rd4                  // no chunks to xfer

            .align  5                           // align inner loop
c64rd3:                                         // loop copying 64-byte chunks

c64rd4:                                         // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
            beq     cr7,c64rbyte                // no leftover doublewords

c64rd5:                                         // loop copying leftover doublewords

// Reverse byte loop.

c64rbyte:                                       // r5/cr0 <- byte count (can be big if unaligned uncached)
            beqlr                               // done if no leftover bytes