/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
; Copy bytes of data around. Handles overlapped data.

#include <ppc/proc_reg.h>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
#define noncache        20

; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32              // total size
#define BCOPY_SF_MSR    16              // we save caller's MSR here (possibly minus VEC and FP)

#define kShort          32              // short operands are special cased
; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - Interrupts must be disabled when this routine is called.
;            - Translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink.  If both are the same,
; only one is loaded.  We do not touch the IBATs, so there is no issue if either physical page
; address is the same as the virtual address of the instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK, since a 32-bit machine cannot have addresses
; wider than 32 bits anyway.
;
; Note also that this routine is used only on 32-bit machines.  If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
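;
; Register usage at entry (following the 32-bit PowerPC calling convention, in which each
; long long argument arrives in a register pair): r3/r4 = high and low halves of the "from"
; physical address, r5/r6 = high and low halves of the "to" physical address, r7 = byte count.
; The rlwinm/rlwimi pairs below simply glue each high:low pair back into a single register,
; after which the arguments sit in r3-r5 exactly as bcopy expects.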
        .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mfsprg  r8,2                        ; get processor feature flags
        stw     r0,8(r1)                    ; save return address
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        subi    r0,r7,1                     ; get length - 1
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        add     r11,r3,r0                   ; Point to last byte of sink
        mr      r5,r7                       ; Get the length into the right register
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
; This test for page overflow may not work if the length is negative.  Negative lengths are invalid
; input to bcopy_physvir() on 32-bit machines, and will result in a panic.

        add     r12,r4,r0                   ; Point to last byte of source
        xor     r7,r11,r3                   ; See if we went to next page
        xor     r8,r12,r4                   ; See if we went to next page
        or      r0,r7,r8                    ; Combine wrap

//      li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
        li      r9,((2<<3)|2)               ; Set default attributes
        rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
        li      r7,2                        ; Set validity flags
        li      r8,2                        ; Set validity flags
        bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...
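
; How the page-crossing test works: r11/r12 hold the last byte of each operand and r3/r4 hold
; the first byte; XORing first and last leaves bits set only where the two addresses differ,
; and the rlwinm. masks off the low 12 bits, so the result is nonzero exactly when an operand's
; first and last bytes lie in different 4KB pages.  For example, an operand starting at 0x1FFC
; with an 8-byte length ends at 0x2003; the xor is 0x3FFF, which still has bits set above the
; low 12 bits, so we take the bne- to bcopy_phys1.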
        rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
        rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
        rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
        rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
        cmplw   cr1,r11,r12                 ; See if sink and source are same block
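
; A note on the values just built (per the classic 32-bit PowerPC BAT layout): each lower DBAT
; word keeps the top bits of the operand address as the block real page number and takes its
; low bits from r9, i.e. WIMG and PP attributes for a cached, coherent, read/write mapping (the
; commented-out line above shows the intent).  Each upper DBAT word takes its block effective
; page index from the same address plus the "2" in r7/r8, which marks the BAT valid in
; supervisor state only.  The block-length field is left zero, so each BAT maps the minimum
; 128KB block at a virtual address equal to the physical address, which is plenty because
; neither operand may cross a page.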
        mtdbatl 0,r11                       ; Set sink lower DBAT
        mtdbatu 0,r7                        ; Set sink upper DBAT

        beq-    cr1,bcpvsame                ; Source and sink are in same block

        mtdbatl 1,r12                       ; Set source lower DBAT
        mtdbatu 1,r8                        ; Set source upper DBAT
bcpvsame:
        sync                                ; wait for the BATs to stabilize
        isync

        bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

        li      r0,0                        ; Get set to invalidate upper half of BATs
        sync                                ; Make sure all is well
        mtdbatu 0,r0                        ; Clear sink upper DBAT
        mtdbatu 1,r0                        ; Clear source upper DBAT
        sync
        isync

        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        mtlr    r0
        blr
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy.  This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs.  We will transform these to 64-bit
; values.  Note that on 32-bit architectures this will ignore the high half of the
; passed-in value.  This should be OK, since a 32-bit machine cannot have addresses
; wider than 32 bits anyway.
;
; Also note that you probably will not be happy if either the sink or source spans the
; boundary between RAM and I/O space.  There is a good chance of hanging the machine,
; and this code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.
        .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
        mflr    r0                          ; get return address
        rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        stw     r0,8(r1)                    ; save return address
        mfsprg  r8,2                        ; get processor feature flags
        stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
        rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
        rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
        mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
        rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
        mr      r5,r7                       ; Get the length into the right register
bcopy_phys1:                                ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
        mfmsr   r9                          ; Get the MSR
        lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
        ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
        andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
        bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

        mtmsr   r9                          ; turn DR, FP, and VEC off
        isync                               ; wait for it to happen

        bl      EXT(bcopy)                  ; do the copy with translation off and caching on

        mfmsr   r9                          ; Get the MSR
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
        mtmsr   r9                          ; restore msr
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        mtlr    r0
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        blr
; 64-bit: turn DR off and SF on.

bcopy_phys64:                               ; r9 = MSR with DR, VEC, and FP off
        ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
        srdi    r2,r3,31                    ; Get a 1 if source is in I/O memory
        li      r0,1                        ; Note - we use this in a couple places below
        srdi    r10,r4,31                   ; Get a 1 if sink is in I/O memory
        std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
        rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on in MSR we will copy with
        cmpldi  cr0,r2,1                    ; Is source in I/O memory?
        cmpldi  cr7,r10,1                   ; Is sink in I/O memory?
        mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
        isync                               ; wait for it to happen
        cror    cr7_eq,cr0_eq,cr7_eq        ; See if either source or sink is in I/O area
        beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space
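
; How the I/O-space test works: srdi by 31 yields exactly 1 when the 64-bit physical address
; lies in the range 0x80000000-0xFFFFFFFF (the 2GB-4GB window), which, per the comments above,
; is the region these machines decode as I/O space rather than RAM.  Anything below 2GB, or at
; 4GB and above, compares unequal to 1 and takes the normal cached path.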
        bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:
        mfmsr   r9                          ; Get the MSR we used to copy
        rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
        ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
        mtmsrd  r9                          ; turn 64-bit mode off, translation back on
        isync                               ; wait for it to happen
        lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
        ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
        mtlr    r0
        mtmsrd  r8,1                        ; turn EE back on if necessary
        addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
        blr
; We need to copy with DR off, but one of the operands is in I/O space.  To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4.  We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well.  This is an unpleasant hack.

io_space_real_mode_copy:                    ; r0=1, r9=MSR we want to copy with
        sldi    r11,r0,31-MSR_EE_BIT        ; Get a mask for the EE bit
        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        andc    r9,r9,r11                   ; Turn off EE bit
        mfspr   r2,hid4                     ; Get HID4
        mtmsrd  r9,1                        ; Force off EE
        or      r2,r2,r0                    ; Set bit to make real accesses cache-inhibited
        sync
        mtspr   hid4,r2                     ; Make real accesses cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared
        bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

        li      r0,1                        ; Get a 1 again so we can rebuild the HID4 bit
        sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
        mfspr   r2,hid4                     ; Get HID4
        andc    r2,r2,r0                    ; Clear bit to make real accesses cache-inhibited
        sync
        mtspr   hid4,r2                     ; Make real accesses not cache-inhibited
        isync                               ; Toss prefetches

        lis     r12,0xE000                  ; Get the unlikeliest ESID possible
        srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
        slbie   r12                         ; Make sure the ERAT is cleared

        b       bcopy_phys64x               ; rejoin the normal 64-bit bcopy_phys exit path
; Special case short operands (<32 bytes), which are very common.  Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so.  This is OK, since performance of the two cases
; is similar.  We do get the direction right when it counts (ie, when the operands overlap.)
; Also note that we use the G3/G4 "backend" code, even on G5.  This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5.  Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
;
;       r12 = (dest - source)

shortcopy:
        cmplw   r12,r5                      ; must move reverse if (dest-source)<length
        mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
        mtcrf   1,r5                        ; ...which is faster on G4 and G5
        bge++   backend                     ; handle forward moves (most common case)
        add     r6,r6,r5                    ; point one past end of operands in reverse moves
        add     r4,r4,r5
        b       bbackend                    ; handle reverse moves
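
; Why the unsigned compare above is enough: if the destination starts above the source, a
; forward copy would clobber not-yet-copied source bytes exactly when (dest - source) < length,
; which is what cmplw tests.  If the destination is below the source, the subtraction wraps to
; a huge unsigned value, so the compare always chooses the (safe) forward path.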
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine.  What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)

        .globl  EXT(bcopy)
        .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(bcopy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; to==from so nothing to do
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory so we cannot use any kind of cache instructions.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions.  Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.

        .globl  EXT(bcopy_nc)
        .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
        cmpwi   cr1,r5,0                    ; Check if we have a 0 length
        sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
        crset   noncache                    ; Set non-cached
        cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        bne+    copyit32                    ; handle 32-bit processor
        blr                                 ; either zero length or to==from
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.

        .globl  EXT(memcpy)
        .globl  EXT(memmove)
        .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
        cmplwi  cr1,r5,kShort               ; less than 32 bytes?
        sub.    r12,r3,r4                   ; test for to==from in mode-independent way, start fwd/rev check
        mr      r6,r4                       ; Set source
        mr      r4,r3                       ; Set the "to" (must preserve r3 for return value)
        blt     cr1,shortcopy               ; special case short operands
        crclr   noncache                    ; Set cached
LEXT(memcpy_nop_if_32bit)
        bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
        beqlr-                              ; exit if to==from
; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlaps must be handled.  We also cannot use the
; cache instructions if we were called from bcopy_nc().  We must preserve r3 because
; memcpy() returns it.  We can be interrupted and lose control here.
;
;       r12 = (dest - source)
;       cr5 = noncache flag

copyit32:                                   ; WARNING! can drop down to this label
        cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
        cntlzw  r11,r5                      ; get magnitude of length
        dcbt    0,r6                        ; start to touch in source
        lis     r10,hi16(0x80000000)        ; get 0x80000000
        neg     r9,r4                       ; start to get alignment for destination
        dcbtst  0,r4                        ; start to touch in destination
        sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
        blt-    cr1,reverse32bit            ; reverse move required
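
; Example of the alignment mask: for a length of 80 bytes, cntlzw returns 25, and shifting
; 0x80000000 right arithmetically by 25 gives 0xFFFFFFC0, the complement of 63 (the largest
; 2**n-1 smaller than the length).  The andc against this mask in the front-end code below
; keeps the alignment moves from exceeding the total operand length.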
; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
;
;       r8 = inverse of largest mask smaller than operand length
;       r9 = neg(dest), used to compute alignment
;       cr5 = noncache flag

forward32bit:                               ; enter from 64-bit CPUs with word aligned uncached operands
        rlwinm  r7,r9,0,0x1F                ; get bytes to 32-byte-align destination
        andc.   r0,r7,r8                    ; limit to the maximum front end move
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        beq     alline                      ; Already on a line...

        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        sub     r5,r5,r0                    ; Set the length left to move

        bf      31,alhalf                   ; No single byte to do...
        lbz     r7,0(r6)                    ; Get the byte
        addi    r6,r6,1                     ; Point to the next
        stb     r7,0(r4)                    ; Save the single
        addi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

alhalf: bf      30,alword                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

alword: bf      29,aldouble                 ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

aldouble: bf    28,alquad                   ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

alquad: bf      27,alline                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        stw     r7,0(r4)                    ; Save the first word
        lwz     r11,12(r6)                  ; Get the fourth word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Sink is line aligned here
alline: rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    backend                     ; No full lines to move

        mtctr   r0                          ; set up loop count
        li      r0,96                       ; Stride for touch ahead

nxtline:
        lwz     r2,0(r6)                    ; Get the first word
        lwz     r5,4(r6)                    ; Get the second word
        lwz     r7,8(r6)                    ; Get the third word
        lwz     r8,12(r6)                   ; Get the fourth word
        lwz     r9,16(r6)                   ; Get the fifth word
        lwz     r10,20(r6)                  ; Get the sixth word
        lwz     r11,24(r6)                  ; Get the seventh word
        lwz     r12,28(r6)                  ; Get the eighth word
        bt-     noncache,skipz              ; Skip if we are not cached...
        dcbz    0,r4                        ; Blow away the whole line because we are replacing it
        dcbt    r6,r0                       ; Touch ahead a bit

skipz:
        addi    r6,r6,32                    ; Point to the next
        stw     r2,0(r4)                    ; Save the first word
        stw     r5,4(r4)                    ; Save the second word
        stw     r7,8(r4)                    ; Save the third word
        stw     r8,12(r4)                   ; Save the fourth word
        stw     r9,16(r4)                   ; Save the fifth word
        stw     r10,20(r4)                  ; Save the sixth word
        stw     r11,24(r4)                  ; Save the seventh word
        stw     r12,28(r4)                  ; Save the eighth word
        addi    r4,r4,32                    ; Bump sink
        bdnz+   nxtline                     ; Do the next line, if any...
; Move backend quadword

backend:                                    ; Join here from "shortcopy" for forward moves <32 bytes
        bf      27,noquad                   ; No quad to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        lwz     r9,8(r6)                    ; Get the third word
        lwz     r11,12(r6)                  ; Get the fourth word
        stw     r7,0(r4)                    ; Save the first word
        addi    r6,r6,16                    ; Point to the next
        stw     r8,4(r4)                    ; Save the second word
        stw     r9,8(r4)                    ; Save the third word
        stw     r11,12(r4)                  ; Save the fourth word
        addi    r4,r4,16                    ; Bump sink

; Move backend double

noquad: bf      28,nodouble                 ; No double to do...
        lwz     r7,0(r6)                    ; Get the first word
        lwz     r8,4(r6)                    ; Get the second word
        addi    r6,r6,8                     ; Point to the next
        stw     r7,0(r4)                    ; Save the first word
        stw     r8,4(r4)                    ; Save the second word
        addi    r4,r4,8                     ; Bump sink

; Move backend word

nodouble: bf    29,noword                   ; No word to do...
        lwz     r7,0(r6)                    ; Get the word
        addi    r6,r6,4                     ; Point to the next
        stw     r7,0(r4)                    ; Save the word
        addi    r4,r4,4                     ; Bump sink

; Move backend halfword

noword: bf      30,nohalf                   ; No halfword to do...
        lhz     r7,0(r6)                    ; Get the halfword
        addi    r6,r6,2                     ; Point to the next
        sth     r7,0(r4)                    ; Save the halfword
        addi    r4,r4,2                     ; Bump sink

; Move backend byte

nohalf: bflr    31                          ; Leave cuz we are all done...
        lbz     r7,0(r6)                    ; Get the byte
        stb     r7,0(r4)                    ; Save the single
        blr
; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned.  We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz.  Fortunately, reverse moves are uncommon.
;
;       r8 = inverse of largest mask smaller than operand length
;       cr5 = noncache flag (but we don't dcbz anyway)

reverse32bit:                               ; here from 64-bit code with word aligned uncached operands
        add     r4,r5,r4                    ; Point past the last sink byte
        add     r6,r5,r6                    ; Point past the last source byte
        rlwinm  r7,r4,0,0x1F                ; Calculate the length to align dest on cache boundary
        li      r12,-1                      ; Make sure we touch in the actual line
        andc.   r0,r7,r8                    ; Apply movement limit
        dcbt    r12,r6                      ; Touch in the last line of source
        mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
        dcbtst  r12,r4                      ; Touch in the last line of the sink
        mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
        beq-    balline                     ; Already on cache line boundary (or too short to bother)

        sub     r5,r5,r0                    ; Precalculate move length left after alignment

        bf      31,balhalf                  ; No single byte to do...
        lbz     r7,-1(r6)                   ; Get the byte
        subi    r6,r6,1                     ; Point to the next
        stb     r7,-1(r4)                   ; Save the single
        subi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

balhalf: bf     30,balword                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword: bf     29,baldouble                ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble: bf   28,balquad                  ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad: bf     27,balline                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here
balline: rlwinm. r0,r5,27,5,31              ; Get the number of full lines to move
        mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
        mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
        beq-    bbackend                    ; No full lines to move
        mtctr   r0                          ; set up loop count

bnxtline:
        lwz     r7,-32(r6)                  ; Get the first word
        lwz     r5,-28(r6)                  ; Get the second word
        lwz     r2,-24(r6)                  ; Get the third word
        lwz     r12,-20(r6)                 ; Get the fourth word
        lwz     r11,-16(r6)                 ; Get the fifth word
        lwz     r10,-12(r6)                 ; Get the sixth word
        lwz     r9,-8(r6)                   ; Get the seventh word
        lwz     r8,-4(r6)                   ; Get the eighth word
        subi    r6,r6,32                    ; Point to the next

        stw     r7,-32(r4)                  ; Save the first word
        stw     r5,-28(r4)                  ; Save the second word
        stw     r2,-24(r4)                  ; Save the third word
        stw     r12,-20(r4)                 ; Save the fourth word
        stw     r11,-16(r4)                 ; Save the fifth word
        stw     r10,-12(r4)                 ; Save the sixth word
        stw     r9,-8(r4)                   ; Save the seventh word
        stw     r8,-4(r4)                   ; Save the eighth word
        subi    r4,r4,32                    ; Bump sink

        bdnz+   bnxtline                    ; Do the next line, if any...
; Note: We touched these lines in at the beginning

; Move backend quadword

bbackend:                                   ; Join here from "shortcopy" for reverse moves of <32 bytes
        bf      27,bnoquad                  ; No quad to do...
        lwz     r7,-16(r6)                  ; Get the first word
        lwz     r8,-12(r6)                  ; Get the second word
        lwz     r9,-8(r6)                   ; Get the third word
        lwz     r11,-4(r6)                  ; Get the fourth word
        stw     r7,-16(r4)                  ; Save the first word
        subi    r6,r6,16                    ; Point to the next
        stw     r8,-12(r4)                  ; Save the second word
        stw     r9,-8(r4)                   ; Save the third word
        stw     r11,-4(r4)                  ; Save the fourth word
        subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad: bf     28,bnodouble                ; No double to do...
        lwz     r7,-8(r6)                   ; Get the first word
        lwz     r8,-4(r6)                   ; Get the second word
        subi    r6,r6,8                     ; Point to the next
        stw     r7,-8(r4)                   ; Save the first word
        stw     r8,-4(r4)                   ; Save the second word
        subi    r4,r4,8                     ; Bump sink

; Move backend word

bnodouble: bf   29,bnoword                  ; No word to do...
        lwz     r7,-4(r6)                   ; Get the word
        subi    r6,r6,4                     ; Point to the next
        stw     r7,-4(r4)                   ; Save the word
        subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword: bf     30,bnohalf                  ; No halfword to do...
        lhz     r7,-2(r6)                   ; Get the halfword
        subi    r6,r6,2                     ; Point to the next
        sth     r7,-2(r4)                   ; Save the halfword
        subi    r4,r4,2                     ; Bump sink

; Move backend byte

bnohalf: bflr   31                          ; Leave cuz we are all done...
        lbz     r7,-1(r6)                   ; Get the byte
        stb     r7,-1(r4)                   ; Save the single
        blr
// Here on 64-bit processors, which have a 128-byte cache line.  This can be
// called either in 32- or 64-bit mode, which makes the test for reverse moves
// a little tricky.  We've already filtered out the (source==dest) and (length==0) cases.
//
//      r4 = destination (32 or 64-bit ptr)
//      r5 = length (always 32 bits)
//      r6 = source (32 or 64-bit ptr)
//      r12 = (dest - source), reverse move required if (dest-source)<length
//      cr5 = noncache flag

copyit64:
        rlwinm  r7,r5,0,0,31                // truncate length to 32-bit, in case we're running in 64-bit mode
        cntlzw  r11,r5                      // get magnitude of length
        dcbt    0,r6                        // touch in 1st block of source
        dcbtst  0,r4                        // touch in 1st destination cache block
        subc    r7,r12,r7                   // set Carry if (dest-source)>=length, in mode-independent way
        li      r0,0                        // get a 0
        lis     r10,hi16(0x80000000)        // get 0x80000000
        addze.  r0,r0                       // set cr0 on carry bit (beq if reverse move required)
        neg     r9,r4                       // start to get alignment for destination
        sraw    r8,r10,r11                  // get mask based on operand length, to limit alignment
        bt--    noncache,c64uncached        // skip if uncached
        beq--   c64rdouble                  // handle cached reverse moves
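
// How the mode-independent reverse test works: subc computes (dest-source) minus the length
// and sets Carry exactly when there is no borrow, i.e. when (dest-source) >= length as an
// unsigned comparison over the full register width.  addze. then adds Carry into the zeroed
// r0, so cr0 is "eq" (r0 still zero) precisely when a reverse move is required.  Using the
// carry chain instead of cmplw/cmpld keeps the same code correct in both 32- and 64-bit mode.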
// Forward, cached or doubleword aligned uncached.  This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned.  We depend on this in the uncached case.
//
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      cr5 = noncache flag

c64double:
        rlwinm  r7,r9,0,0x7F                // get #bytes to 128-byte align destination
        andc    r7,r7,r8                    // limit by operand length
        andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
        srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
        sub     r5,r5,r7                    // adjust length remaining
        cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
        srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
        cmpwi   cr7,r10,0                   // set cr7 on chunk count
        beq     c64double2                  // dest already doubleword aligned
        .align  5                           // align inner loops
c64double1:                                 // copy bytes until dest is doubleword aligned

c64double2:                                 // r9/cr1=doublewords, r10/cr7=128-byte chunks
        beq     cr1,c64double4              // no doublewords to xfer in order to cache align

        .align  5                           // align inner loops
c64double3:                                 // copy doublewords until dest is 128-byte aligned

// Here to xfer 128-byte chunks, if any.  Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                 // r10/cr7=128-byte chunks
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
        cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
        beq     cr7,c64double7              // no 128-byte chunks
; We must check for (source-dest)<128 in a mode-independent way.  If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.

        sub     r8,r6,r4                    // r8 <- (source - dest)
        rldicr. r0,r8,0,63-7                // zero low 7 bits and check for 0, mode independent
        cror    noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128
        mtctr   r10                         // set up chunk loop count
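
// Note on the test above: rldicr. clears the low 7 bits of (source-dest) and sets cr0_eq only
// when all the bits above them are zero, i.e. only when 0 <= (source-dest) < 128.  That is
// exactly the case in which dcbz128 on a destination line could zero source bytes that have
// not been loaded yet, so we fall back to the non-dcbz path.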
        .align  5                           // align inner loop
c64InnerLoop:                               // loop copying 128-byte cache lines to 128-aligned destination
        ld      r0,0(r6)                    // start pipe: load 1st half-line

        bt      noncache,c64InnerLoop1      // skip if uncached or overlap
        dcbz128 0,r4                        // avoid prefetch of next cache line

        ld      r0,64(r6)                   // load 2nd half of chunk

        addi    r4,r4,128                   // advance to next dest chunk

        bdnz    c64InnerLoop                // loop if more chunks
c64double7:                                 // r5 <- leftover bytes, cr1 set on doubleword count
        rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
        beq     cr1,c64byte                 // no leftover doublewords

        .align  5                           // align inner loop
c64double8:                                 // loop copying leftover doublewords

// Forward byte loop.

c64byte:                                    // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes

        .align  5                           // align inner loop
// Uncached copies.  We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//
//      r8 = inverse of largest mask smaller than operand length
//      r9 = neg(dest), used to compute alignment
//      r12 = (dest-source), used to test relative alignment
//      cr0 = beq if reverse move required
//      cr5 = noncache flag

c64uncached:
        rlwinm  r10,r12,0,29,31             // relatively doubleword aligned?
        rlwinm  r11,r12,0,30,31             // relatively word aligned?
        cmpwi   cr7,r10,0                   // set cr7 beq if doubleword aligned
        cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
        beq--   c64reverseUncached

        beq     cr7,c64double               // doubleword aligned
        beq     cr1,forward32bit            // word aligned, use G3/G4 code
        cmpwi   r5,0                        // set cr0 on byte count
        b       c64byte                     // unaligned operands

c64reverseUncached:
        beq     cr7,c64rdouble              // doubleword aligned so can use LD/STD
        beq     cr1,reverse32bit            // word aligned, use G3/G4 code
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        cmpwi   r5,0                        // set cr0 on length
        b       c64rbyte                    // copy a byte at a time
// Reverse doubleword copies.  This is used for all cached copies, and doubleword
// aligned uncached copies.
//
//      r8 = inverse of largest mask of low-order 1s smaller than operand length
//      cr5 = noncache flag

c64rdouble:
        add     r6,r6,r5                    // point to (end+1) of source and dest
        add     r4,r4,r5
        rlwinm  r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
        andc.   r7,r7,r8                    // limit by operand length
        sub     r5,r5,r7                    // adjust length
        srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
        cmpwi   cr1,r8,0                    // any chunks?
        beq     c64rd2                      // dest already doubleword aligned
c64rd1:                                     // copy bytes until dest is doubleword aligned

c64rd2:                                     // r8/cr1 <- count of 64-byte chunks
        rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
        andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
        cmpwi   cr7,r0,0                    // leftover doublewords?
        beq     cr1,c64rd4                  // no chunks to xfer

        .align  5                           // align inner loop
c64rd3:                                     // loop copying 64-byte chunks

c64rd4:                                     // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
        beq     cr7,c64rbyte                // no leftover doublewords

c64rd5:                                     // loop copying leftover doublewords

// Reverse byte loop.

c64rbyte:                                   // r5/cr0 <- byte count (can be big if unaligned uncached)
        beqlr                               // done if no leftover bytes