1 /*
2 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 ;
31 ; Copy bytes of data around. Handles overlapped data.
32 ;
33 ;
34 #include <ppc/asm.h>
35 #include <ppc/proc_reg.h>
36 #include <assym.s>
37
38 ; These routines use CR5 for certain flags:
39 ; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
40 #define noncache 20
41
42
43 ; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
44 #define BCOPY_SF_SIZE 32 // total size
45 #define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP)
46
47
48 #define kShort 32 // short operands are special cased
49
50
51 ; void bcopy_physvir_32(from, to, nbytes)
52 ;
53 ; Attempt to copy physically addressed memory with translation on if conditions are met.
54 ; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
55 ; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
56 ; for the passed phys addrs and do the copy with translation on.
57 ;
58 ; Rules are: - neither source nor destination can cross a page.
59 ; - Interrupts must be disabled when this routine is called.
60 ; - Translation must be on when called.
61 ;
62 ; To do the copy, we build a 128KB DBAT for both the source and the sink. If both are in the
63 ; same block, only one is loaded. We do not touch the IBATs, so there is no issue if either physical page
64 ; address is the same as the virtual address of the instructions we are executing.
65 ;
66 ; At the end, we invalidate the used DBATs.
67 ;
68 ; Note that the address parameters are long longs. We will transform these to 64-bit
69 ; values. Note that on 32-bit architectures this will ignore the high half of the
70 ; passed-in value. This should be OK, since addresses wider than 32 bits cannot occur
71 ; there anyhow.
72 ;
73 ; Note also that this routine is used only on 32-bit machines. If you're contemplating use
74 ; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
75 ; for an example of how this is done.
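;
; (Added note, a rough C-level sketch rather than part of the original description: the 32-bit
; ABI passes each 64-bit paddr as a hi/lo register pair, and the rlwinm/rlwimi pairs below
; simply reassemble each pair into a single register, roughly
;
;       paddr = ((uint64_t)hi << 32) | lo;
;
; On a 32-bit processor the same instructions just keep the low word, which is why the high
; half is ignored there.)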
76
77 .align 5
78 .globl EXT(bcopy_physvir_32)
79
80 LEXT(bcopy_physvir_32)
81 mflr r0 ; get return address
82 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
83 mfsprg r8,2 ; get processor feature flags
84 stw r0,8(r1) ; save return address
85 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
86 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
87 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
88 subi r0,r7,1 ; get length - 1
89 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
90 add r11,r3,r0 ; Point to last byte of sink
91 mr r5,r7 ; Get the length into the right register
92 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
93
94 ; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
95 ; to bcopy_physvir_32() on 32-bit machines, and will result in a panic.
96
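; (Added worked example: with 4KB pages, XORing the first and last byte addresses leaves a
; nonzero bit above the 12-bit page offset exactly when an operand crosses a page; the
; rlwinm. below masks off the offset bits -- keeping bits 0-19 -- to test just that.)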
97 add r12,r4,r0 ; Point to last byte of source
98 xor r7,r11,r3 ; See if we went to next page
99 xor r8,r12,r4 ; See if we went to next page
100 or r0,r7,r8 ; Combine wrap
101
102 // li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
103 li r9,((2<<3)|2) ; Set default attributes
104 rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
105 li r7,2 ; Set validity flags
106 li r8,2 ; Set validity flags
107 bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
108
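; (Added note on the DBAT values built below; the field layout is summarized from the standard
; 32-bit BAT format, not from this file: the lower DBAT gets the 128KB-aligned physical block
; number plus (2<<3)|2, i.e. WIMG=0b0010 (coherent, cacheable) and PP=0b10 (read/write); the
; upper DBAT gets the same block number as the effective block index plus BL=0 (128KB block)
; and Vs=1. The net effect is an identity mapping of each 128KB block onto itself, so bcopy
; can run with translation on using the physical addresses directly.)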
109 rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
110 rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
111 rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
112 rlwimi r8,r12,0,0,14 ; Set source upper DBAT value
113 cmplw cr1,r11,r12 ; See if sink and source are same block
114
115 sync
116
117 mtdbatl 0,r11 ; Set sink lower DBAT
118 mtdbatu 0,r7 ; Set sink upper DBAT
119
120 beq- cr1,bcpvsame ; Source and sink are in same block
121
122 mtdbatl 1,r12 ; Set source lower DBAT
123 mtdbatu 1,r8 ; Set source upper DBAT
124
125 bcpvsame:
126 sync ; wait for the BATs to stabilize
127 isync
128
129 bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on
130
131 li r0,0 ; Get set to invalidate upper half of BATs
132 sync ; Make sure all is well
133 mtdbatu 0,r0 ; Clear sink upper DBAT
134 mtdbatu 1,r0 ; Clear source upper DBAT
135 sync
136 isync
137
138 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address
139 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
140 mtlr r0
141 blr
142
143
144 ; void bcopy_phys(from, to, nbytes)
145 ;
146 ; Turns off data translation before the copy. This one will not work in user state.
147 ; This routine is used on 32 and 64-bit machines.
148 ;
149 ; Note that the address parameters are long longs. We will transform these to 64-bit
150 ; values. Note that on 32-bit architectures this will ignore the high half of the
151 ; passed-in value. This should be OK, since addresses wider than 32 bits cannot occur
152 ; there anyhow.
153 ;
154 ; Also note that you probably will not be happy if either the sink or source spans across the
155 ; boundary between RAM and I/O space. Good chance of hanging the machine and this code
156 ; will not check, so be careful.
157 ;
158 ; NOTE: when called, translation must be on, and we must be in 32-bit mode.
159 ; Interrupts may or may not be disabled.
160
161 .align 5
162 .globl EXT(bcopy_phys)
163
164 LEXT(bcopy_phys)
165 mflr r0 ; get return address
166 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
167 stw r0,8(r1) ; save
168 mfsprg r8,2 ; get processor feature flags
169 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
170 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
171 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
172 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
173 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
174 mr r5,r7 ; Get the length into the right register
175
176 bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
177 mfmsr r9 ; Get the MSR
178 lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
179 ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
180 andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off
181 bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
182
183 ; 32-bit CPUs
184
185 mtmsr r9 ; turn DR, FP, and VEC off
186 isync ; Wait for it
187
188 bl EXT(bcopy) ; do the copy with translation off and caching on
189
190 mfmsr r9 ; Get the MSR
191 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off)
192 mtmsr r9 ; restore msr
193 isync ; wait for it to happen
194 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
195 mtlr r0
196 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
197 blr
198
199
200 ; 64-bit: turn DR off and SF on.
201
202 bcopy_phys64: ; r9 = MSR with DR, VEC, and FP off
203 ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller
204 srdi r2,r3,31 ; Get a 1 if source is in I/O memory
205 li r0,1 ; Note - we use this in a couple places below
206 srdi r10,r4,31 ; Get a 1 if sink is in I/O memory
207 std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on
208 rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with
209 cmpldi cr0,r2,1 ; Is source in I/O memory?
210 cmpldi cr7,r10,1 ; Is sink in I/O memory?
211 mtmsrd r9 ; turn 64-bit addressing on, data translation off
212 isync ; wait for it to happen
213 cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area
214 beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space
215
216 bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled
217
218 bcopy_phys64x:
219 mfmsr r9 ; Get the MSR we used to copy
220 rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
221 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on
222 mtmsrd r9 ; turn 64-bit mode off, translation back on
223 isync ; wait for it to happen
224 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
225 ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on
226 mtlr r0
227 mtmsrd r8,1 ; turn EE back on if necessary
228 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
229 blr
230
231 ; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
232 ; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
233 ; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
234 ; this state, so we have to disable interrupts as well. This is an unpleasant hack.
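; (Added note, an assumption based on the 970 documentation rather than on this file: the bit
; set below, 1 << (32+8), is presumably the 970's real-mode cache-inhibit control in HID4,
; which forces real-mode data accesses to be treated as cache-inhibited.)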
235
236 io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with
237 sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit
238 sldi r0,r0,32+8 ; Get the right bit to turn off caching
239 andc r9,r9,r11 ; Turn off EE bit
240 mfspr r2,hid4 ; Get HID4
241 mtmsrd r9,1 ; Force off EE
242 or r2,r2,r0 ; Set bit to make real accesses cache-inhibited
243 sync ; Sync up
244 mtspr hid4,r2 ; Make real accesses cache-inhibited
245 isync ; Toss prefetches
246
247 lis r12,0xE000 ; Get the unlikeliest ESID possible
248 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
249 slbie r12 ; Make sure the ERAT is cleared
250
251 sync
252 isync
253
254 bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited
255
256 li r0,1 ; Get a 1
257 sldi r0,r0,32+8 ; Get the right bit to turn off caching
258 mfspr r2,hid4 ; Get HID4
259 andc r2,r2,r0 ; Clear bit to make real accesses cache-inhibited
260 sync ; Sync up
261 mtspr hid4,r2 ; Make real accesses not cache-inhibited
262 isync ; Toss prefetches
263
264 lis r12,0xE000 ; Get the unlikeliest ESID possible
265 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
266 slbie r12 ; Make sure the ERAT is cleared
267 b bcopy_phys64x
268
269
270 ;
271 ; shortcopy
272 ;
273 ; Special case short operands (<32 bytes), which are very common. Note that the check for
274 ; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
275 ; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
276 ; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
277 ; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
278 ; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
279 ; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
280 ; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
281 ; might do unaligned accesses this code cannot be called from bcopy_nc().
282 ; r4 = destination
283 ; r5 = length (<32)
284 ; r6 = source
285 ; r12 = (dest - source)
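;
; (Added note: the two mtcrf ops below scatter the low five bits of the length into CR6/CR7,
; so the backend code can branch on them directly -- CR bit 27 is the 16s bit of the length,
; 28 the 8s, 29 the 4s, 30 the 2s, and 31 the 1s bit. Two single-field mtcrf ops are used
; because a multi-field mtcrf is slower on G4/G5, as the code comments note.)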
286
287 .align 5
288 shortcopy:
289 cmplw r12,r5 ; must move reverse if (dest-source)<length
290 mtcrf 2,r5 ; move length to cr6 and cr7 one at a time...
291 mtcrf 1,r5 ; ...which is faster on G4 and G5
292 bge++ backend ; handle forward moves (most common case)
293 add r6,r6,r5 ; point one past end of operands in reverse moves
294 add r4,r4,r5
295 b bbackend ; handle reverse moves
296
297 ;
298 ; void bcopy(from, to, nbytes)
299 ;
300 ; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
301 ; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
302 ; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
303 ; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
304 ; registers to accelerate the copy.
305 ;
306 ; NOTE: this code can be called in any of three "modes":
307 ; - on 32-bit processors (32-byte cache line)
308 ; - on 64-bit processors running in 32-bit mode (128-byte cache line)
309 ; - on 64-bit processors running in 64-bit mode (128-byte cache line)
310
311 .align 5
312 .globl EXT(bcopy)
313 .globl EXT(bcopy_nop_if_32bit)
314
315 LEXT(bcopy)
316 cmplwi cr1,r5,kShort ; less than 32 bytes?
317 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
318 mr r6,r3 ; Set source (must preserve r3 for memcpy return)
319 blt cr1,shortcopy ; special case short operands
320 crclr noncache ; Set cached
321 LEXT(bcopy_nop_if_32bit)
322 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
323 bne+ copyit32 ; handle 32-bit processor
324 blr ; to==from so nothing to do
325
326 ;
327 ; bcopy_nc(from, to, nbytes)
328 ;
329 ; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
330 ; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
331 ; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
332 ; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
333
334 .align 5
335 .globl EXT(bcopy_nc)
336 .globl EXT(bcopy_nc_nop_if_32bit)
337
338 LEXT(bcopy_nc)
339 cmpwi cr1,r5,0 ; Check if we have a 0 length
340 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
341 mr r6,r3 ; Set source (must preserve r3 for memcpy return)
342 crset noncache ; Set non-cached
343 cror cr0_eq,cr1_eq,cr0_eq ; set cr0 beq if either length zero or to==from
344 LEXT(bcopy_nc_nop_if_32bit)
345 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
346 bne+ copyit32 ; handle 32-bit processor
347 blr ; either zero length or to==from
348
349 ;
350 ; void* memcpy(to, from, nbytes)
351 ; void* memmove(to, from, nbytes)
352 ;
353 ; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
354 ; However, they would work correctly if called in 64-bit mode.
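; (Added note: both entry points share the same code, and the (dest - source) vs. length test
; below selects a reverse copy when the operands overlap, so this memcpy is effectively
; memmove-safe for overlapping operands as well.)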
355
356 .align 5
357 .globl EXT(memcpy)
358 .globl EXT(memmove)
359 .globl EXT(memcpy_nop_if_32bit)
360
361 LEXT(memcpy)
362 LEXT(memmove)
363 cmplwi cr1,r5,kShort ; less than 32 bytes?
364 sub. r12,r3,r4 ; test for to==from in mode-independent way, start fwd/rev check
365 mr r6,r4 ; Set source
366 mr r4,r3 ; Set the "to" (must preserve r3 for return value)
367 blt cr1,shortcopy ; special case short operands
368 crclr noncache ; Set cached
369 LEXT(memcpy_nop_if_32bit)
370 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
371 beqlr- ; exit if to==from
372
373
374 ; Here to copy on 32-bit processors.
375 ;
376 ; When we move the memory, forward overlap must be handled. We
377 ; also cannot use the cache instructions if we were entered from bcopy_nc.
378 ; We need to preserve r3 because it is the return value of memcpy.
379 ; We can be interrupted and lose control here.
380 ;
381 ; When entered:
382 ; r4 = destination
383 ; r5 = length (>0)
384 ; r6 = source
385 ; r12 = (dest - source)
386 ; cr5 = noncache flag
387
388 copyit32: ; WARNING! can drop down to this label
389 cmplw cr1,r12,r5 ; must move reverse if (dest-source)<length
390 cntlzw r11,r5 ; get magnitude of length
391 dcbt 0,r6 ; start to touch in source
392 lis r10,hi16(0x80000000) ; get 0x80000000
393 neg r9,r4 ; start to get alignment for destination
394 dcbtst 0,r4 ; start to touch in destination
395 sraw r8,r10,r11 ; get mask based on operand length, to limit alignment
396 blt- cr1,reverse32bit ; reverse move required
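; (Added note on the mask trick above: cntlzw gives the number of leading zeros of the length,
; and the sraw of 0x80000000 by that count yields a mask of every bit at or above the length's
; highest set bit. andc'ing the bytes-to-align count with that mask limits the front-end
; alignment moves so they can never exceed the total length.)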
397
398 ; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
399 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
400 ; word aligned. We depend on this in the uncached case on 64-bit processors.
401 ; r4 = destination
402 ; r5 = length (>0)
403 ; r6 = source
404 ; r8 = inverse of largest mask smaller than operand length
405 ; r9 = neg(dest), used to compute alignment
406 ; cr5 = noncache flag
407
408 forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands
409 rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination
410 andc. r0,r7,r8 ; limit to the maximum front end move
411 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
412 beq alline ; Already on a line...
413
414 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
415 sub r5,r5,r0 ; Set the length left to move
416
417 bf 31,alhalf ; No single byte to do...
418 lbz r7,0(r6) ; Get the byte
419 addi r6,r6,1 ; Point to the next
420 stb r7,0(r4) ; Save the single
421 addi r4,r4,1 ; Bump sink
422
423 ; Sink is halfword aligned here
424
425 alhalf: bf 30,alword ; No halfword to do...
426 lhz r7,0(r6) ; Get the halfword
427 addi r6,r6,2 ; Point to the next
428 sth r7,0(r4) ; Save the halfword
429 addi r4,r4,2 ; Bump sink
430
431 ; Sink is word aligned here
432
433 alword: bf 29,aldouble ; No word to do...
434 lwz r7,0(r6) ; Get the word
435 addi r6,r6,4 ; Point to the next
436 stw r7,0(r4) ; Save the word
437 addi r4,r4,4 ; Bump sink
438
439 ; Sink is double aligned here
440
441 aldouble: bf 28,alquad ; No double to do...
442 lwz r7,0(r6) ; Get the first word
443 lwz r8,4(r6) ; Get the second word
444 addi r6,r6,8 ; Point to the next
445 stw r7,0(r4) ; Save the first word
446 stw r8,4(r4) ; Save the second word
447 addi r4,r4,8 ; Bump sink
448
449 ; Sink is quadword aligned here
450
451 alquad: bf 27,alline ; No quad to do...
452 lwz r7,0(r6) ; Get the first word
453 lwz r8,4(r6) ; Get the second word
454 lwz r9,8(r6) ; Get the third word
455 stw r7,0(r4) ; Save the first word
456 lwz r11,12(r6) ; Get the fourth word
457 addi r6,r6,16 ; Point to the next
458 stw r8,4(r4) ; Save the second word
459 stw r9,8(r4) ; Save the third word
460 stw r11,12(r4) ; Save the fourth word
461 addi r4,r4,16 ; Bump sink
462
463 ; Sink is line aligned here
464
465 alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
466 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
467 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
468 beq- backend ; No full lines to move
469
470 mtctr r0 ; set up loop count
471 li r0,96 ; Stride for touch ahead
472 b nxtline
473
474 .align 4
475 nxtline:
476 lwz r2,0(r6) ; Get the first word
477 lwz r5,4(r6) ; Get the second word
478 lwz r7,8(r6) ; Get the third word
479 lwz r8,12(r6) ; Get the fourth word
480 lwz r9,16(r6) ; Get the fifth word
481 lwz r10,20(r6) ; Get the sixth word
482 lwz r11,24(r6) ; Get the seventh word
483 lwz r12,28(r6) ; Get the eighth word
484 bt- noncache,skipz ; Skip if we are not cached...
485 dcbz 0,r4 ; Blow away the whole line because we are replacing it
486 dcbt r6,r0 ; Touch ahead a bit
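; (Added note: the dcbz is skipped in the noncached case because, per the architecture, a dcbz
; to cache-inhibited or write-through storage takes an alignment interrupt instead of zeroing
; the line.)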
487 skipz:
488 addi r6,r6,32 ; Point to the next
489 stw r2,0(r4) ; Save the first word
490 stw r5,4(r4) ; Save the second word
491 stw r7,8(r4) ; Save the third word
492 stw r8,12(r4) ; Save the fourth word
493 stw r9,16(r4) ; Save the fifth word
494 stw r10,20(r4) ; Save the sixth word
495 stw r11,24(r4) ; Save the seventh word
496 stw r12,28(r4) ; Save the eighth word
497 addi r4,r4,32 ; Bump sink
498 bdnz+ nxtline ; Do the next line, if any...
499
500
501 ; Move backend quadword
502
503 backend: ; Join here from "shortcopy" for forward moves <32 bytes
504 bf 27,noquad ; No quad to do...
505 lwz r7,0(r6) ; Get the first word
506 lwz r8,4(r6) ; Get the second word
507 lwz r9,8(r6) ; Get the third word
508 lwz r11,12(r6) ; Get the fourth word
509 stw r7,0(r4) ; Save the first word
510 addi r6,r6,16 ; Point to the next
511 stw r8,4(r4) ; Save the second word
512 stw r9,8(r4) ; Save the third word
513 stw r11,12(r4) ; Save the fourth word
514 addi r4,r4,16 ; Bump sink
515
516 ; Move backend double
517
518 noquad: bf 28,nodouble ; No double to do...
519 lwz r7,0(r6) ; Get the first word
520 lwz r8,4(r6) ; Get the second word
521 addi r6,r6,8 ; Point to the next
522 stw r7,0(r4) ; Save the first word
523 stw r8,4(r4) ; Save the second word
524 addi r4,r4,8 ; Bump sink
525
526 ; Move backend word
527
528 nodouble: bf 29,noword ; No word to do...
529 lwz r7,0(r6) ; Get the word
530 addi r6,r6,4 ; Point to the next
531 stw r7,0(r4) ; Save the word
532 addi r4,r4,4 ; Bump sink
533
534 ; Move backend halfword
535
536 noword: bf 30,nohalf ; No halfword to do...
537 lhz r7,0(r6) ; Get the halfword
538 addi r6,r6,2 ; Point to the next
539 sth r7,0(r4) ; Save the halfword
540 addi r4,r4,2 ; Bump sink
541
542 ; Move backend byte
543
544 nohalf: bflr 31 ; Leave cuz we are all done...
545 lbz r7,0(r6) ; Get the byte
546 stb r7,0(r4) ; Save the single
547 blr
548
549
550 ; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
551 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
552 ; word aligned. We depend on this in the uncached case on 64-bit processors.
553 ; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
554 ; r4 = destination
555 ; r5 = length (>0)
556 ; r6 = source
557 ; r8 = inverse of largest mask smaller than operand length
558 ; cr5 = noncache flag (but we don't dcbz anyway)
559
560 reverse32bit: ; here from 64-bit code with word aligned uncached operands
561 add r4,r5,r4 ; Point past the last sink byte
562 add r6,r5,r6 ; Point past the last source byte
563 rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary
564 li r12,-1 ; Make sure we touch in the actual line
565 andc. r0,r7,r8 ; Apply movement limit
566 dcbt r12,r6 ; Touch in the last line of source
567 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
568 dcbtst r12,r4 ; Touch in the last line of the sink
569 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
570 beq- balline ; Already on cache line boundary (or too short to bother)
571
572 sub r5,r5,r0 ; Precalculate move length left after alignment
573
574 bf 31,balhalf ; No single byte to do...
575 lbz r7,-1(r6) ; Get the byte
576 subi r6,r6,1 ; Point to the next
577 stb r7,-1(r4) ; Save the single
578 subi r4,r4,1 ; Bump sink
579
580 ; Sink is halfword aligned here
581
582 balhalf: bf 30,balword ; No halfword to do...
583 lhz r7,-2(r6) ; Get the halfword
584 subi r6,r6,2 ; Point to the next
585 sth r7,-2(r4) ; Save the halfword
586 subi r4,r4,2 ; Bump sink
587
588 ; Sink is word aligned here
589
590 balword: bf 29,baldouble ; No word to do...
591 lwz r7,-4(r6) ; Get the word
592 subi r6,r6,4 ; Point to the next
593 stw r7,-4(r4) ; Save the word
594 subi r4,r4,4 ; Bump sink
595
596 ; Sink is double aligned here
597
598 baldouble: bf 28,balquad ; No double to do...
599 lwz r7,-8(r6) ; Get the first word
600 lwz r8,-4(r6) ; Get the second word
601 subi r6,r6,8 ; Point to the next
602 stw r7,-8(r4) ; Save the first word
603 stw r8,-4(r4) ; Save the second word
604 subi r4,r4,8 ; Bump sink
605
606 ; Sink is quadword aligned here
607
608 balquad: bf 27,balline ; No quad to do...
609 lwz r7,-16(r6) ; Get the first word
610 lwz r8,-12(r6) ; Get the second word
611 lwz r9,-8(r6) ; Get the third word
612 lwz r11,-4(r6) ; Get the fourth word
613 stw r7,-16(r4) ; Save the first word
614 subi r6,r6,16 ; Point to the next
615 stw r8,-12(r4) ; Save the second word
616 stw r9,-8(r4) ; Save the third word
617 stw r11,-4(r4) ; Save the fourth word
618 subi r4,r4,16 ; Bump sink
619
620 ; Sink is line aligned here
621
622 balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
623 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
624 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
625 beq- bbackend ; No full lines to move
626 mtctr r0 ; set up loop count
627 b bnxtline
628
629 .align 4
630 bnxtline:
631 lwz r7,-32(r6) ; Get the first word
632 lwz r5,-28(r6) ; Get the second word
633 lwz r2,-24(r6) ; Get the third word
634 lwz r12,-20(r6) ; Get the fourth word
635 lwz r11,-16(r6) ; Get the fifth word
636 lwz r10,-12(r6) ; Get the sixth word
637 lwz r9,-8(r6) ; Get the seventh word
638 lwz r8,-4(r6) ; Get the eighth word
639 subi r6,r6,32 ; Point to the next
640
641 stw r7,-32(r4) ; Save the first word
642 stw r5,-28(r4) ; Save the second word
643 stw r2,-24(r4) ; Save the third word
644 stw r12,-20(r4) ; Save the fourth word
645 stw r11,-16(r4) ; Save the fifth word
646 stw r10,-12(r4) ; Save the sixth word
647 stw r9,-8(r4) ; Save the seventh word
648 stw r8,-4(r4) ; Save the eighth word
649 subi r4,r4,32 ; Bump sink
650
651 bdnz+ bnxtline ; Do the next line, if any...
652
653 ;
654 ; Note: We touched these lines in at the beginning
655 ;
656
657 ; Move backend quadword
658
659 bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes
660 bf 27,bnoquad ; No quad to do...
661 lwz r7,-16(r6) ; Get the first word
662 lwz r8,-12(r6) ; Get the second word
663 lwz r9,-8(r6) ; Get the third word
664 lwz r11,-4(r6) ; Get the fourth word
665 stw r7,-16(r4) ; Save the first word
666 subi r6,r6,16 ; Point to the next
667 stw r8,-12(r4) ; Save the second word
668 stw r9,-8(r4) ; Save the third word
669 stw r11,-4(r4) ; Save the fourth word
670 subi r4,r4,16 ; Bump sink
671
672 ; Move backend double
673
674 bnoquad: bf 28,bnodouble ; No double to do...
675 lwz r7,-8(r6) ; Get the first word
676 lwz r8,-4(r6) ; Get the second word
677 subi r6,r6,8 ; Point to the next
678 stw r7,-8(r4) ; Save the first word
679 stw r8,-4(r4) ; Save the second word
680 subi r4,r4,8 ; Bump sink
681
682 ; Move backend word
683
684 bnodouble: bf 29,bnoword ; No word to do...
685 lwz r7,-4(r6) ; Get the word
686 subi r6,r6,4 ; Point to the next
687 stw r7,-4(r4) ; Save the word
688 subi r4,r4,4 ; Bump sink
689
690 ; Move backend halfword
691
692 bnoword: bf 30,bnohalf ; No halfword to do...
693 lhz r7,-2(r6) ; Get the halfword
694 subi r6,r6,2 ; Point to the next
695 sth r7,-2(r4) ; Save the halfword
696 subi r4,r4,2 ; Bump sink
697
698 ; Move backend byte
699
700 bnohalf: bflr 31 ; Leave cuz we are all done...
701 lbz r7,-1(r6) ; Get the byte
702 stb r7,-1(r4) ; Save the single
703 blr
704
705
706 // Here on 64-bit processors, which have a 128-byte cache line. This can be
707 // called either in 32 or 64-bit mode, which makes the test for reverse moves
708 // a little tricky. We've already filtered out the (source==dest) and (len==0)
709 // special cases.
710 //
711 // When entered:
712 // r4 = destination (32 or 64-bit ptr)
713 // r5 = length (always 32 bits)
714 // r6 = source (32 or 64-bit ptr)
715 // r12 = (dest - source), reverse move required if (dest-source)<length
716 // cr5 = noncache flag
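//
// (Added note on the reverse-move test below: subc computes (dest - source) - length and sets
// CA when there is no borrow, i.e. when (dest - source) >= length as an unsigned compare;
// addze. then copies CA into cr0, so "beq" means a borrow occurred and a reverse move is
// required. This works identically in 32- and 64-bit mode.)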
717
718 .align 5
719 copyit64:
720 rlwinm r7,r5,0,0,31 // truncate length to 32-bit, in case we're running in 64-bit mode
721 cntlzw r11,r5 // get magnitude of length
722 dcbt 0,r6 // touch in 1st block of source
723 dcbtst 0,r4 // touch in 1st destination cache block
724 subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
725 li r0,0 // get a 0
726 lis r10,hi16(0x80000000)// get 0x80000000
727 addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
728 neg r9,r4 // start to get alignment for destination
729 sraw r8,r10,r11 // get mask based on operand length, to limit alignment
730 bt-- noncache,c64uncached// skip if uncached
731 beq-- c64rdouble // handle cached reverse moves
732
733
734 // Forward, cached or doubleword aligned uncached. This is the common case.
735 // NOTE: we never do an unaligned access if the source and destination are "relatively"
736 // doubleword aligned. We depend on this in the uncached case.
737 // r4 = destination
738 // r5 = length (>0)
739 // r6 = source
740 // r8 = inverse of largest mask smaller than operand length
741 // r9 = neg(dest), used to compute alignment
742 // cr5 = noncache flag
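//
// (Added worked example of the alignment arithmetic below, with illustrative values and
// assuming the length is large enough not to limit it: for a destination ending in 0x38,
// neg(dest) ends in 0xC8, so r7 = 0xC8 & 0x7F = 0x48 (72 bytes to the next 128-byte boundary),
// giving r8 = 0 odd bytes and r9 = 9 doublewords.)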
743
744 c64double:
745 rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
746 andc r7,r7,r8 // limit by operand length
747 andi. r8,r7,7 // r8 <- #bytes to doubleword align
748 srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
749 sub r5,r5,r7 // adjust length remaining
750 cmpwi cr1,r9,0 // any doublewords to move to cache align?
751 srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
752 cmpwi cr7,r10,0 // set cr7 on chunk count
753 beq c64double2 // dest already doubleword aligned
754 mtctr r8
755 b c64double1
756
757 .align 5 // align inner loops
758 c64double1: // copy bytes until dest is doubleword aligned
759 lbz r0,0(r6)
760 addi r6,r6,1
761 stb r0,0(r4)
762 addi r4,r4,1
763 bdnz c64double1
764
765 c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
766 beq cr1,c64double4 // no doublewords to xfer in order to cache align
767 mtctr r9
768 b c64double3
769
770 .align 5 // align inner loops
771 c64double3: // copy doublewords until dest is 128-byte aligned
772 ld r7,0(r6)
773 addi r6,r6,8
774 std r7,0(r4)
775 addi r4,r4,8
776 bdnz c64double3
777
778 // Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
779 // data (64 bytes), we load/store each twice per 128-byte chunk.
780
781 c64double4: // r10/cr7=128-byte chunks
782 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
783 cmpwi cr1,r0,0 // set cr1 on leftover doublewords
784 beq cr7,c64double7 // no 128-byte chunks
785
786 ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
787 ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
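; (Added note: if the source lies 0..127 bytes above the destination, a dcbz128 of a destination
; line would zero source bytes that have not been loaded yet, so the cached path must fall back
; to plain stores in that case.)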
788
789 sub r8,r6,r4 // r8 <- (source - dest)
790 rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
791 cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
792 mtctr r10
793 b c64InnerLoop
794
795 .align 5 // align inner loop
796 c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
797 ld r0,0(r6) // start pipe: load 1st half-line
798 ld r2,8(r6)
799 ld r7,16(r6)
800 ld r8,24(r6)
801 ld r9,32(r6)
802 ld r10,40(r6)
803 ld r11,48(r6)
804 ld r12,56(r6)
805 bt noncache,c64InnerLoop1 // skip if uncached or overlap
806 dcbz128 0,r4 // avoid prefetch of next cache line
807 c64InnerLoop1:
808
809 std r0,0(r4)
810 std r2,8(r4)
811 std r7,16(r4)
812 std r8,24(r4)
813 std r9,32(r4)
814 std r10,40(r4)
815 std r11,48(r4)
816 std r12,56(r4)
817
818 ld r0,64(r6) // load 2nd half of chunk
819 ld r2,72(r6)
820 ld r7,80(r6)
821 ld r8,88(r6)
822 ld r9,96(r6)
823 ld r10,104(r6)
824 ld r11,112(r6)
825 ld r12,120(r6)
826 addi r6,r6,128
827
828 std r0,64(r4)
829 std r2,72(r4)
830 std r7,80(r4)
831 std r8,88(r4)
832 std r9,96(r4)
833 std r10,104(r4)
834 std r11,112(r4)
835 std r12,120(r4)
836 addi r4,r4,128 // advance to next dest chunk
837
838 bdnz c64InnerLoop // loop if more chunks
839
840
841 c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
842 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
843 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
844 beq cr1,c64byte // no leftover doublewords
845 mtctr r0
846 b c64double8
847
848 .align 5 // align inner loop
849 c64double8: // loop copying leftover doublewords
850 ld r0,0(r6)
851 addi r6,r6,8
852 std r0,0(r4)
853 addi r4,r4,8
854 bdnz c64double8
855
856
857 // Forward byte loop.
858
859 c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
860 beqlr // done if no leftover bytes
861 mtctr r5
862 b c64byte1
863
864 .align 5 // align inner loop
865 c64byte1:
866 lbz r0,0(r6)
867 addi r6,r6,1
868 stb r0,0(r4)
869 addi r4,r4,1
870 bdnz c64byte1
871
872 blr
873
874
875 // Uncached copies. We must avoid unaligned accesses, since they always take alignment
876 // exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
877 // a byte at a time, but that is still much faster than alignment exceptions.
878 // r4 = destination
879 // r5 = length (>0)
880 // r6 = source
881 // r8 = inverse of largest mask smaller than operand length
882 // r9 = neg(dest), used to compute alignment
883 // r12 = (dest-source), used to test relative alignment
884 // cr0 = beq if reverse move required
885 // cr5 = noncache flag
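//
// (Added note: only the relative alignment matters here -- if (dest - source) is a multiple of
// 8 the operands can be doubleword-aligned together and copied with ld/std; if it is only a
// multiple of 4, the word-at-a-time G3/G4 code is reused; otherwise everything moves a byte at
// a time to avoid alignment interrupts on the uncached mapping.)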
886
887 c64uncached:
888 rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
889 rlwinm r11,r12,0,30,31 // relatively word aligned?
890 cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
891 cmpwi cr1,r11,0 // set cr1 beq if word aligned
892 beq-- c64reverseUncached
893
894 beq cr7,c64double // doubleword aligned
895 beq cr1,forward32bit // word aligned, use G3/G4 code
896 cmpwi r5,0 // set cr0 on byte count
897 b c64byte // unaligned operands
898
899 c64reverseUncached:
900 beq cr7,c64rdouble // doubleword aligned so can use LD/STD
901 beq cr1,reverse32bit // word aligned, use G3/G4 code
902 add r6,r6,r5 // point to (end+1) of source and dest
903 add r4,r4,r5
904 cmpwi r5,0 // set cr0 on length
905 b c64rbyte // copy a byte at a time
906
907
908
909 // Reverse doubleword copies. This is used for all cached copies, and doubleword
910 // aligned uncached copies.
911 // r4 = destination
912 // r5 = length (>0)
913 // r6 = source
914 // r8 = inverse of largest mask of low-order 1s smaller than operand length
915 // cr5 = noncache flag
916
917 c64rdouble:
918 add r6,r6,r5 // point to (end+1) of source and dest
919 add r4,r4,r5
920 rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
921 andc. r7,r7,r8 // limit by operand length
922 sub r5,r5,r7 // adjust length
923 srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
924 cmpwi cr1,r8,0 // any chunks?
925 beq c64rd2 // dest already doubleword aligned
926 mtctr r7
927
928 c64rd1: // copy bytes until dest is doubleword aligned
929 lbzu r0,-1(r6)
930 stbu r0,-1(r4)
931 bdnz c64rd1
932
933 c64rd2: // r8/cr1 <- count of 64-byte chunks
934 rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
935 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
936 cmpwi cr7,r0,0 // leftover doublewords?
937 beq cr1,c64rd4 // no chunks to xfer
938 mtctr r8
939 b c64rd3
940
941 .align 5 // align inner loop
942 c64rd3: // loop copying 64-byte chunks
943 ld r7,-8(r6)
944 ld r8,-16(r6)
945 ld r9,-24(r6)
946 ld r10,-32(r6)
947 ld r11,-40(r6)
948 ld r12,-48(r6)
949 std r7,-8(r4)
950 std r8,-16(r4)
951 ld r7,-56(r6)
952 ldu r8,-64(r6)
953 std r9,-24(r4)
954 std r10,-32(r4)
955 std r11,-40(r4)
956 std r12,-48(r4)
957 std r7,-56(r4)
958 stdu r8,-64(r4)
959 bdnz c64rd3
960
961 c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
962 beq cr7,c64rbyte // no leftover doublewords
963 mtctr r0
964
965 c64rd5: // loop copying leftover doublewords
966 ldu r0,-8(r6)
967 stdu r0,-8(r4)
968 bdnz c64rd5
969
970
971 // Reverse byte loop.
972
973 c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
974 beqlr // done if no leftover bytes
975 mtctr r5
976
977 c64rbyte1:
978 lbzu r0,-1(r6)
979 stbu r0,-1(r4)
980 bdnz c64rbyte1
981
982 blr
983