apple/xnu: osfmk/ppc/bcopy.s
1 /*
2 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23 ;
24 ; Copy bytes of data around. Handles overlapped data.
25 ;
26 ;
27 #include <ppc/asm.h>
28 #include <ppc/proc_reg.h>
29 #include <assym.s>
30
31 ; These routines use CR5 for certain flags:
32 ; Use CR5_lt to indicate non-cached (in bcopy and memcpy)
33 #define noncache 20
34
35
36 ; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
37 #define BCOPY_SF_SIZE 32 // total size
38 #define BCOPY_SF_MSR 16 // we save caller's MSR here (possibly minus VEC and FP)
39
40
41 #define kShort 32 // short operands are special cased
42
43
44 ; void bcopy_physvir_32(from, to, nbytes)
45 ;
46 ; Attempt to copy physically addressed memory with translation on if conditions are met.
47 ; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
48 ; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
49 ; for the passed phys addrs and do the copy with translation on.
50 ;
51 ; Rules are: - neither source nor destination can cross a page.
52 ; - Interrupts must be disabled when this routine is called.
53 ; - Translation must be on when called.
54 ;
55 ; To do the copy, we build a 128KB DBAT for both the source and sink. If both are the same, only one
56 ; is loaded. We do not touch the IBATs, so there is no issue if either physical page
57 ; address is the same as the virtual address of the instructions we are executing.
58 ;
59 ; At the end, we invalidate the used DBATs.
60 ;
61 ; Note that the address parameters are long longs. We will transform these to 64-bit
62 ; values. Note that on 32-bit architectures this will ignore the high half of the
63 ; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
64 ; there anyhow.
65 ;
66 ; Note also that this routine is used only on 32-bit machines. If you're contemplating use
67 ; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
68 ; for an example of how this is done.
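;
; As an editor's illustration only (the helper name is hypothetical, not part of this file),
; the "neither operand crosses a page" rule tested below is equivalent to this C sketch:
;
;   #include <stdint.h>
;   #include <stdbool.h>
;
;   /* True if [addr, addr+len) stays within a single 4KB page (len > 0). */
;   static bool fits_in_one_page(uint32_t addr, uint32_t len) {
;       uint32_t last = addr + (len - 1);              /* last byte touched       */
;       return ((addr ^ last) & 0xFFFFF000u) == 0;     /* page number unchanged?  */
;   }
;
;   /* bcopy_physvir_32 takes the DBAT fast path only if both the source and the
;      sink qualify; otherwise it falls through to the normal bcopy_phys path.  */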
69
70 .align 5
71 .globl EXT(bcopy_physvir_32)
72
73 LEXT(bcopy_physvir_32)
74 mflr r0 ; get return address
75 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
76 mfsprg r8,2 ; get processor feature flags
77 stw r0,8(r1) ; save return address
78 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
79 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
80 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
81 subi r0,r7,1 ; get length - 1
82 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
83 add r11,r3,r0 ; Point to last byte of sink
84 mr r5,r7 ; Get the length into the right register
85 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
86
87 ; This test for page overflow may not work if the length is negative. Negative lengths are invalid input
88 ; to bcopy_physvir() on 32-bit machines, and will result in a panic.
89
90 add r12,r4,r0 ; Point to last byte of source
91 xor r7,r11,r3 ; See if we went to next page
92 xor r8,r12,r4 ; See if we went to next page
93 or r0,r7,r8 ; Combine wrap
94
95 // li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
96 li r9,((2<<3)|2) ; Set default attributes
97 rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
98 li r7,2 ; Set validity flags
99 li r8,2 ; Set validity flags
100 bne- bcopy_phys1 ; Overflowed page, do normal physical copy...
101
102 rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
103 rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
104 rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
105 rlwimi r8,r12,0,0,14 ; Set source upper DBAT value
106 cmplw cr1,r11,r12 ; See if sink and source are same block
107
108 sync
109
110 mtdbatl 0,r11 ; Set sink lower DBAT
111 mtdbatu 0,r7 ; Set sink upper DBAT
112
113 beq- cr1,bcpvsame ; Source and sink are in same block
114
115 mtdbatl 1,r12 ; Set source lower DBAT
116 mtdbatu 1,r8 ; Set source upper DBAT
117
118 bcpvsame:
119 sync ; wait for the BATs to stabilize
120 isync
121
122 bl EXT(bcopy) ; BATs set up, args in r3-r5, so do the copy with DR on
123
124 li r0,0 ; Get set to invalidate upper half of BATs
125 sync ; Make sure all is well
126 mtdbatu 0,r0 ; Clear sink upper DBAT
127 mtdbatu 1,r0 ; Clear source upper DBAT
128 sync
129 isync
130
131 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address
132 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
133 mtlr r0
134 blr
135
136
137 ; void bcopy_phys(from, to, nbytes)
138 ;
139 ; Turns off data translation before the copy. This one will not work in user state.
140 ; This routine is used on 32 and 64-bit machines.
141 ;
142 ; Note that the address parameters are long longs. We will transform these to 64-bit
143 ; values. Note that on 32-bit architectures this will ignore the high half of the
144 ; passed-in value. This should be OK, since we cannot have addresses bigger than 32 bits
145 ; there anyhow.
146 ;
147 ; Also note that you probably will not be happy if either the sink or source spans across the
148 ; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
149 ; code does not check for that, so be careful.
150 ;
151 ; NOTE: when called, translation must be on, and we must be in 32-bit mode.
152 ; Interrupts may or may not be disabled.
153
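;
; A rough C-level sketch of the 32-bit path (illustrative only; mfmsr()/mtmsr() and the
; MSR_* masks stand in for the privileged instructions and assembler constants used below):
;
;   #include <stdint.h>
;
;   void bcopy_phys_sketch(uint64_t from, uint64_t to, uint32_t nbytes) {
;       uint32_t msr = mfmsr();                          /* caller's MSR             */
;       mtmsr(msr & ~(MSR_VEC | MSR_FP | MSR_DR));       /* VEC, FP, translation off */
;       bcopy((void *)(uintptr_t)from, (void *)(uintptr_t)to, nbytes);
;       mtmsr(mfmsr() | MSR_DR);                         /* translation back on      */
;   }
;
; The 64-bit path additionally turns on MSR_SF and may have to special-case I/O space,
; as described at bcopy_phys64 below.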
154 .align 5
155 .globl EXT(bcopy_phys)
156
157 LEXT(bcopy_phys)
158 mflr r0 ; get return address
159 rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
160 stw r0,8(r1) ; save
161 mfsprg r8,2 ; get processor feature flags
162 stwu r1,-BCOPY_SF_SIZE(r1) ; push on a stack frame so we can call bcopy
163 rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
164 rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
165 mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
166 rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
167 mr r5,r7 ; Get the length into the right register
168
169 bcopy_phys1: ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
170 mfmsr r9 ; Get the MSR
171 lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
172 ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
173 andc r9,r9,r6 ; unconditionally turn DR, VEC, and FP off
174 bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)
175
176 ; 32-bit CPUs
177
178 mtmsr r9 ; turn DR, FP, and VEC off
179 isync ; Wait for it
180
181 bl EXT(bcopy) ; do the copy with translation off and caching on
182
183 mfmsr r9 ; Get the MSR
184 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on (but leave VEC and FP off)
185 mtmsr r9 ; restore msr
186 isync ; wait for it to happen
187 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
188 mtlr r0
189 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
190 blr
191
192
193 ; 64-bit: turn DR off and SF on.
194
195 bcopy_phys64: ; r9 = MSR with DR, VEC, and FP off
196 ori r8,r9,lo16(MASK(MSR_DR)) ; make a copy with DR back on... this is what we return to caller
197 srdi r2,r3,31 ; Get a 1 if source is in I/O memory
198 li r0,1 ; Note - we use this in a couple places below
199 srdi r10,r4,31 ; Get a 1 if sink is in I/O memory
200 std r8,BCOPY_SF_MSR(r1) ; save caller's MSR so we remember whether EE was on
201 rldimi r9,r0,63,MSR_SF_BIT ; set SF on in MSR we will copy with
202 cmpldi cr0,r2,1 ; Is source in I/O memory?
203 cmpldi cr7,r10,1 ; Is sink in I/O memory?
204 mtmsrd r9 ; turn 64-bit addressing on, data translation off
205 isync ; wait for it to happen
206 cror cr7_eq,cr0_eq,cr7_eq ; See if either source or sink is in I/O area
207 beq-- cr7,io_space_real_mode_copy ; an operand is in I/O space
208
209 bl EXT(bcopy) ; do copy with DR off and SF on, cache enabled
210
211 bcopy_phys64x:
212 mfmsr r9 ; Get the MSR we used to copy
213 rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
214 ori r9,r9,lo16(MASK(MSR_DR)) ; turn translation back on
215 mtmsrd r9 ; turn 64-bit mode off, translation back on
216 isync ; wait for it to happen
217 lwz r0,BCOPY_SF_SIZE+8(r1) ; get return address once translation is back on
218 ld r8,BCOPY_SF_MSR(r1) ; get caller's MSR once translation is back on
219 mtlr r0
220 mtmsrd r8,1 ; turn EE back on if necessary
221 addi r1,r1,BCOPY_SF_SIZE ; pop off stack frame
222 blr
223
224 ; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
225 ; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
226 ; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
227 ; this state, so we have to disable interrupts as well. This is an unpleasant hack.
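;
; In outline, as a hedged paraphrase (mtspr/mfspr/slbie here are stand-ins for the
; privileged instructions, and HID4_RM_CI is an illustrative name for the bit set by
; "sldi r0,r0,32+8"; none of these are real C symbols in this file):
;
;   msr  &= ~MSR_EE;                     /* no interrupts while HID4 is changed    */
;   hid4  = mfspr(HID4);
;   mtmsrd(msr);                         /* EE off                                 */
;   mtspr(HID4, hid4 | HID4_RM_CI);      /* real-mode accesses now cache-inhibited */
;   slbie(junk_esid);                    /* toss any stale ERAT entries            */
;   bcopy_nc(from, to, nbytes);          /* uncached copy                          */
;   mtspr(HID4, hid4);                   /* restore HID4                           */
;   slbie(junk_esid);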
228
229 io_space_real_mode_copy: ; r0=1, r9=MSR we want to copy with
230 sldi r11,r0,31-MSR_EE_BIT ; Get a mask for the EE bit
231 sldi r0,r0,32+8 ; Get the right bit to turn off caching
232 andc r9,r9,r11 ; Turn off EE bit
233 mfspr r2,hid4 ; Get HID4
234 mtmsrd r9,1 ; Force off EE
235 or r2,r2,r0 ; Set bit to make real accesses cache-inhibited
236 sync ; Sync up
237 mtspr hid4,r2 ; Make real accesses cache-inhibited
238 isync ; Toss prefetches
239
240 lis r12,0xE000 ; Get the unlikeliest ESID possible
241 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
242 slbie r12 ; Make sure the ERAT is cleared
243
244 sync
245 isync
246
247 bl EXT(bcopy_nc) ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited
248
249 li r0,1 ; Get a 1
250 sldi r0,r0,32+8 ; Get the right bit to turn off caching
251 mfspr r2,hid4 ; Get HID4
252 andc r2,r2,r0 ; Clear bit to make real accesses cache-inhibited
253 sync ; Sync up
254 mtspr hid4,r2 ; Make real accesses not cache-inhibited
255 isync ; Toss prefetches
256
257 lis r12,0xE000 ; Get the unlikeliest ESID possible
258 srdi r12,r12,1 ; Make 0x7FFFFFFFF0000000
259 slbie r12 ; Make sure the ERAT is cleared
260 b bcopy_phys64x
261
262
263 ;
264 ; shortcopy
265 ;
266 ; Special case short operands (<32 bytes), which are very common. Note that the check for
267 ; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
268 ; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
269 ; is similar. We do get the direction right when it counts (i.e., when the operands overlap).
270 ; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
271 ; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
272 ; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
273 ; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
274 ; might do unaligned accesses, this code cannot be called from bcopy_nc().
275 ; r4 = destination
276 ; r5 = length (<32)
277 ; r6 = source
278 ; r12 = (dest - source)
279
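;
; The direction test, as a hedged C sketch (illustrative only):
;
;   #include <stdint.h>
;
;   /* Move in reverse iff the start of the destination lies inside the source
;      operand, i.e. (unsigned)(dest - source) < length.  Only the low 32 bits
;      are compared here, which is why the test is approximate in 64-bit mode. */
;   static int must_copy_reverse(uint32_t dest, uint32_t source, uint32_t length) {
;       return (uint32_t)(dest - source) < length;
;   }
;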
280 .align 5
281 shortcopy:
282 cmplw r12,r5 ; must move reverse if (dest-source)<length
283 mtcrf 2,r5 ; move length to cr6 and cr7 one at a time...
284 mtcrf 1,r5 ; ...which is faster on G4 and G5
285 bge++ backend ; handle forward moves (most common case)
286 add r6,r6,r5 ; point one past end of operands in reverse moves
287 add r4,r4,r5
288 b bbackend ; handle reverse moves
289
290 ;
291 ; void bcopy(from, to, nbytes)
292 ;
293 ; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
294 ; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
295 ; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
296 ; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
297 ; registers to accelerate the copy.
298 ;
299 ; NOTE: this code can be called in any of three "modes":
300 ; - on 32-bit processors (32-byte cache line)
301 ; - on 64-bit processors running in 32-bit mode (128-byte cache line)
302 ; - on 64-bit processors running in 64-bit mode (128-byte cache line)
303
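;
; For reference, the C-level interface this entry point implements (a sketch; the
; kernel's own prototype may differ in exact types):
;
;   void bcopy(const void *from, void *to, size_t nbytes);   /* overlap-safe */
;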
304 .align 5
305 .globl EXT(bcopy)
306 .globl EXT(bcopy_nop_if_32bit)
307
308 LEXT(bcopy)
309 cmplwi cr1,r5,kShort ; less than 32 bytes?
310 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
311 mr r6,r3 ; Set source (must preserve r3 for memcopy return)
312 blt cr1,shortcopy ; special case short operands
313 crclr noncache ; Set cached
314 LEXT(bcopy_nop_if_32bit)
315 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
316 bne+ copyit32 ; handle 32-bit processor
317 blr ; to==from so nothing to do
318
319 ;
320 ; bcopy_nc(from, to, nbytes)
321 ;
322 ; bcopy_nc() operates on non-cached memory, so we cannot use any kind of cache instruction.
323 ; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
324 ; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
325 ; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
326
327 .align 5
328 .globl EXT(bcopy_nc)
329 .globl EXT(bcopy_nc_nop_if_32bit)
330
331 LEXT(bcopy_nc)
332 cmpwi cr1,r5,0 ; Check if we have a 0 length
333 sub. r12,r4,r3 ; test for to==from in mode-independent way, start fwd/rev check
334 mr r6,r3 ; Set source (must preserve r3 for memcopy return)
335 crset noncache ; Set non-cached
336 cror cr0_eq,cr1_eq,cr0_eq ; set cr0 beq if either length zero or to==from
337 LEXT(bcopy_nc_nop_if_32bit)
338 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
339 bne+ copyit32 ; handle 32-bit processor
340 blr ; either zero length or to==from
341
342 ;
343 ; void* memcpy(to, from, nbytes)
344 ; void* memmove(to, from, nbytes)
345 ;
346 ; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
347 ; However, they would work correctly if called in 64-bit mode.
348
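;
; Note that the argument order differs from bcopy(), which is why r3/r4/r6 are shuffled
; below. A hedged C picture of the relationship (illustrative only):
;
;   void *memcpy (void *to, const void *from, size_t n);   /* returns to            */
;   void *memmove(void *to, const void *from, size_t n);   /* same entry point here */
;
;   /* Conceptually: memcpy(to, from, n) behaves like bcopy(from, to, n) and then
;      returns to; r3 is preserved through the copy for exactly that reason.      */
;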
349 .align 5
350 .globl EXT(memcpy)
351 .globl EXT(memmove)
352 .globl EXT(memcpy_nop_if_32bit)
353
354 LEXT(memcpy)
355 LEXT(memmove)
356 cmplwi cr1,r5,kShort ; less than 32 bytes?
357 sub. r12,r3,r4 ; test for to==from in mode-independent way, start fwd/rev check
358 mr r6,r4 ; Set source
359 mr r4,r3 ; Set the "to" (must preserve r3 for return value)
360 blt cr1,shortcopy ; special case short operands
361 crclr noncache ; Set cached
362 LEXT(memcpy_nop_if_32bit)
363 bne++ copyit64 ; handle 64-bit processor (patched to NOP if 32-bit processor)
364 beqlr- ; exit if to==from
365
366
367 ; Here to copy on 32-bit processors.
368 ;
369 ; When we move the memory, forward overlaps must be handled. We
370 ; also cannot use the cache instructions if we were entered from bcopy_nc().
371 ; We must preserve r3 because it is the return value for memcpy().
372 ; We can be interrupted and lose control here.
373 ;
374 ; When entered:
375 ; r4 = destination
376 ; r5 = length (>0)
377 ; r6 = source
378 ; r12 = (dest - source)
379 ; cr5 = noncache flag
380
381 copyit32: ; WARNING! can drop down to this label
382 cmplw cr1,r12,r5 ; must move reverse if (dest-source)<length
383 cntlzw r11,r5 ; get magnitude of length
384 dcbt 0,r6 ; start to touch in source
385 lis r10,hi16(0x80000000) ; get 0x80000000
386 neg r9,r4 ; start to get alignment for destination
387 dcbtst 0,r4 ; start to touch in destination
388 sraw r8,r10,r11 ; get mask based on operand length, to limit alignment
389 blt- cr1,reverse32bit ; reverse move required
390
391 ; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
392 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
393 ; word aligned. We depend on this in the uncached case on 64-bit processors.
394 ; r4 = destination
395 ; r5 = length (>0)
396 ; r6 = source
397 ; r8 = inverse of largest mask smaller than operand length
398 ; r9 = neg(dest), used to compute alignment
399 ; cr5 = noncache flag
400
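;
; How the limit mask in r8 works, as a hedged C sketch (illustrative only; r8 is
; computed above by "cntlzw r11,r5" and "sraw r8,r10,r11"):
;
;   #include <stdint.h>
;
;   /* r8 has the top (clz(len)+1) bits set, so ~r8 keeps only the bits strictly
;      below the highest set bit of len.  ANDing the bytes-to-align count with ~r8
;      (the "andc" below) guarantees the front-end move never exceeds the length. */
;   static uint32_t limit_alignment(uint32_t bytes_to_align, uint32_t len) {
;       uint32_t clz  = (uint32_t)__builtin_clz(len);     /* len > 0 here  */
;       uint32_t mask = ~0u << (31 - clz);                /* r8            */
;       return bytes_to_align & ~mask;                    /* andc r0,r7,r8 */
;   }
;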
401 forward32bit: ; enter from 64-bit CPUs with word aligned uncached operands
402 rlwinm r7,r9,0,0x1F ; get bytes to 32-byte-align destination
403 andc. r0,r7,r8 ; limit to the maximum front end move
404 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
405 beq alline ; Already on a line...
406
407 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
408 sub r5,r5,r0 ; Set the length left to move
409
410 bf 31,alhalf ; No single byte to do...
411 lbz r7,0(r6) ; Get the byte
412 addi r6,r6,1 ; Point to the next
413 stb r7,0(r4) ; Save the single
414 addi r4,r4,1 ; Bump sink
415
416 ; Sink is halfword aligned here
417
418 alhalf: bf 30,alword ; No halfword to do...
419 lhz r7,0(r6) ; Get the halfword
420 addi r6,r6,2 ; Point to the next
421 sth r7,0(r4) ; Save the halfword
422 addi r4,r4,2 ; Bump sink
423
424 ; Sink is word aligned here
425
426 alword: bf 29,aldouble ; No word to do...
427 lwz r7,0(r6) ; Get the word
428 addi r6,r6,4 ; Point to the next
429 stw r7,0(r4) ; Save the word
430 addi r4,r4,4 ; Bump sink
431
432 ; Sink is double aligned here
433
434 aldouble: bf 28,alquad ; No double to do...
435 lwz r7,0(r6) ; Get the first word
436 lwz r8,4(r6) ; Get the second word
437 addi r6,r6,8 ; Point to the next
438 stw r7,0(r4) ; Save the first word
439 stw r8,4(r4) ; Save the second word
440 addi r4,r4,8 ; Bump sink
441
442 ; Sink is quadword aligned here
443
444 alquad: bf 27,alline ; No quad to do...
445 lwz r7,0(r6) ; Get the first word
446 lwz r8,4(r6) ; Get the second word
447 lwz r9,8(r6) ; Get the third word
448 stw r7,0(r4) ; Save the first word
449 lwz r11,12(r6) ; Get the fourth word
450 addi r6,r6,16 ; Point to the next
451 stw r8,4(r4) ; Save the second word
452 stw r9,8(r4) ; Save the third word
453 stw r11,12(r4) ; Save the fourth word
454 addi r4,r4,16 ; Bump sink
455
456 ; Sink is line aligned here
457
458 alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
459 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
460 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
461 beq- backend ; No full lines to move
462
463 mtctr r0 ; set up loop count
464 li r0,96 ; Stride for touch ahead
465 b nxtline
466
467 .align 4
468 nxtline:
469 lwz r2,0(r6) ; Get the first word
470 lwz r5,4(r6) ; Get the second word
471 lwz r7,8(r6) ; Get the third word
472 lwz r8,12(r6) ; Get the fourth word
473 lwz r9,16(r6) ; Get the fifth word
474 lwz r10,20(r6) ; Get the sixth word
475 lwz r11,24(r6) ; Get the seventh word
476 lwz r12,28(r6) ; Get the eighth word
477 bt- noncache,skipz ; Skip if we are not cached...
478 dcbz 0,r4 ; Blow away the whole line because we are replacing it
479 dcbt r6,r0 ; Touch ahead a bit
480 skipz:
481 addi r6,r6,32 ; Point to the next
482 stw r2,0(r4) ; Save the first word
483 stw r5,4(r4) ; Save the second word
484 stw r7,8(r4) ; Save the third word
485 stw r8,12(r4) ; Save the fourth word
486 stw r9,16(r4) ; Save the fifth word
487 stw r10,20(r4) ; Save the sixth word
488 stw r11,24(r4) ; Save the seventh word
489 stw r12,28(r4) ; Save the eighth word
490 addi r4,r4,32 ; Bump sink
491 bdnz+ nxtline ; Do the next line, if any...
492
493
494 ; Move backend quadword
495
496 backend: ; Join here from "shortcopy" for forward moves <32 bytes
497 bf 27,noquad ; No quad to do...
498 lwz r7,0(r6) ; Get the first word
499 lwz r8,4(r6) ; Get the second word
500 lwz r9,8(r6) ; Get the third word
501 lwz r11,12(r6) ; Get the fourth word
502 stw r7,0(r4) ; Save the first word
503 addi r6,r6,16 ; Point to the next
504 stw r8,4(r4) ; Save the second word
505 stw r9,8(r4) ; Save the third word
506 stw r11,12(r4) ; Save the fourth word
507 addi r4,r4,16 ; Bump sink
508
509 ; Move backend double
510
511 noquad: bf 28,nodouble ; No double to do...
512 lwz r7,0(r6) ; Get the first word
513 lwz r8,4(r6) ; Get the second word
514 addi r6,r6,8 ; Point to the next
515 stw r7,0(r4) ; Save the first word
516 stw r8,4(r4) ; Save the second word
517 addi r4,r4,8 ; Bump sink
518
519 ; Move backend word
520
521 nodouble: bf 29,noword ; No word to do...
522 lwz r7,0(r6) ; Get the word
523 addi r6,r6,4 ; Point to the next
524 stw r7,0(r4) ; Save the word
525 addi r4,r4,4 ; Bump sink
526
527 ; Move backend halfword
528
529 noword: bf 30,nohalf ; No halfword to do...
530 lhz r7,0(r6) ; Get the halfword
531 addi r6,r6,2 ; Point to the next
532 sth r7,0(r4) ; Save the halfword
533 addi r4,r4,2 ; Bump sink
534
535 ; Move backend byte
536
537 nohalf: bflr 31 ; Leave cuz we are all done...
538 lbz r7,0(r6) ; Get the byte
539 stb r7,0(r4) ; Save the single
540 blr
541
542
543 ; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
544 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
545 ; word aligned. We depend on this in the uncached case on 64-bit processors.
546 ; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
547 ; r4 = destination
548 ; r5 = length (>0)
549 ; r6 = source
550 ; r8 = inverse of largest mask smaller than operand length
551 ; cr5 = noncache flag (but we don't dcbz anyway)
552
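;
; Shape of the reverse path, as a hedged C sketch (illustrative only; the real code
; below also aligns the sink and moves whole cache lines):
;
;   #include <stddef.h>
;   #include <stdint.h>
;
;   static void copy_reverse_bytes(uint8_t *dst, const uint8_t *src, size_t n) {
;       dst += n;                          /* point one past the last byte, as below */
;       src += n;
;       while (n--)
;           *--dst = *--src;               /* descend so overlapping tails are safe  */
;   }
;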
553 reverse32bit: ; here from 64-bit code with word aligned uncached operands
554 add r4,r5,r4 ; Point past the last sink byte
555 add r6,r5,r6 ; Point past the last source byte
556 rlwinm r7,r4,0,0x1F ; Calculate the length to align dest on cache boundary
557 li r12,-1 ; Make sure we touch in the actual line
558 andc. r0,r7,r8 ; Apply movement limit
559 dcbt r12,r6 ; Touch in the last line of source
560 mtcrf 0x01,r0 ; move length to cr6 and cr7 one cr at a time...
561 dcbtst r12,r4 ; Touch in the last line of the sink
562 mtcrf 0x02,r0 ; ...since moving more than one is slower on G4 and G5
563 beq- balline ; Already on cache line boundary (or too short to bother)
564
565 sub r5,r5,r0 ; Precalculate move length left after alignment
566
567 bf 31,balhalf ; No single byte to do...
568 lbz r7,-1(r6) ; Get the byte
569 subi r6,r6,1 ; Point to the next
570 stb r7,-1(r4) ; Save the single
571 subi r4,r4,1 ; Bump sink
572
573 ; Sink is halfword aligned here
574
575 balhalf: bf 30,balword ; No halfword to do...
576 lhz r7,-2(r6) ; Get the halfword
577 subi r6,r6,2 ; Point to the next
578 sth r7,-2(r4) ; Save the halfword
579 subi r4,r4,2 ; Bump sink
580
581 ; Sink is word aligned here
582
583 balword: bf 29,baldouble ; No word to do...
584 lwz r7,-4(r6) ; Get the word
585 subi r6,r6,4 ; Point to the next
586 stw r7,-4(r4) ; Save the word
587 subi r4,r4,4 ; Bump sink
588
589 ; Sink is double aligned here
590
591 baldouble: bf 28,balquad ; No double to do...
592 lwz r7,-8(r6) ; Get the first word
593 lwz r8,-4(r6) ; Get the second word
594 subi r6,r6,8 ; Point to the next
595 stw r7,-8(r4) ; Save the first word
596 stw r8,-4(r4) ; Save the second word
597 subi r4,r4,8 ; Bump sink
598
599 ; Sink is quadword aligned here
600
601 balquad: bf 27,balline ; No quad to do...
602 lwz r7,-16(r6) ; Get the first word
603 lwz r8,-12(r6) ; Get the second word
604 lwz r9,-8(r6) ; Get the third word
605 lwz r11,-4(r6) ; Get the fourth word
606 stw r7,-16(r4) ; Save the first word
607 subi r6,r6,16 ; Point to the next
608 stw r8,-12(r4) ; Save the second word
609 stw r9,-8(r4) ; Save the third word
610 stw r11,-4(r4) ; Save the fourth word
611 subi r4,r4,16 ; Bump sink
612
613 ; Sink is line aligned here
614
615 balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
616 mtcrf 0x02,r5 ; move length to cr6 and cr7 one cr at a time...
617 mtcrf 0x01,r5 ; ...since moving more than one is slower on G4 and G5
618 beq- bbackend ; No full lines to move
619 mtctr r0 ; set up loop count
620 b bnxtline
621
622 .align 4
623 bnxtline:
624 lwz r7,-32(r6) ; Get the first word
625 lwz r5,-28(r6) ; Get the second word
626 lwz r2,-24(r6) ; Get the third word
627 lwz r12,-20(r6) ; Get the fourth word
628 lwz r11,-16(r6) ; Get the fifth word
629 lwz r10,-12(r6) ; Get the sixth word
630 lwz r9,-8(r6) ; Get the seventh word
631 lwz r8,-4(r6) ; Get the eighth word
632 subi r6,r6,32 ; Point to the next
633
634 stw r7,-32(r4) ; Save the first word
635 stw r5,-28(r4) ; Save the second word
636 stw r2,-24(r4) ; Save the third word
637 stw r12,-20(r4) ; Save the fourth word
638 stw r11,-16(r4) ; Save the fifth word
639 stw r10,-12(r4) ; Save the sixth word
640 stw r9,-8(r4) ; Save the seventh word
641 stw r8,-4(r4) ; Save the eighth word
642 subi r4,r4,32 ; Bump sink
643
644 bdnz+ bnxtline ; Do the next line, if any...
645
646 ;
647 ; Note: We touched these lines in at the beginning
648 ;
649
650 ; Move backend quadword
651
652 bbackend: ; Join here from "shortcopy" for reverse moves of <32 bytes
653 bf 27,bnoquad ; No quad to do...
654 lwz r7,-16(r6) ; Get the first word
655 lwz r8,-12(r6) ; Get the second word
656 lwz r9,-8(r6) ; Get the third word
657 lwz r11,-4(r6) ; Get the fourth word
658 stw r7,-16(r4) ; Save the first word
659 subi r6,r6,16 ; Point to the next
660 stw r8,-12(r4) ; Save the second word
661 stw r9,-8(r4) ; Save the third word
662 stw r11,-4(r4) ; Save the fourth word
663 subi r4,r4,16 ; Bump sink
664
665 ; Move backend double
666
667 bnoquad: bf 28,bnodouble ; No double to do...
668 lwz r7,-8(r6) ; Get the first word
669 lwz r8,-4(r6) ; Get the second word
670 subi r6,r6,8 ; Point to the next
671 stw r7,-8(r4) ; Save the first word
672 stw r8,-4(r4) ; Save the second word
673 subi r4,r4,8 ; Bump sink
674
675 ; Move backend word
676
677 bnodouble: bf 29,bnoword ; No word to do...
678 lwz r7,-4(r6) ; Get the word
679 subi r6,r6,4 ; Point to the next
680 stw r7,-4(r4) ; Save the word
681 subi r4,r4,4 ; Bump sink
682
683 ; Move backend halfword
684
685 bnoword: bf 30,bnohalf ; No halfword to do...
686 lhz r7,-2(r6) ; Get the halfword
687 subi r6,r6,2 ; Point to the next
688 sth r7,-2(r4) ; Save the halfword
689 subi r4,r4,2 ; Bump sink
690
691 ; Move backend byte
692
693 bnohalf: bflr 31 ; Leave cuz we are all done...
694 lbz r7,-1(r6) ; Get the byte
695 stb r7,-1(r4) ; Save the single
696 blr
697
698
699 // Here on 64-bit processors, which have a 128-byte cache line. This can be
700 // called either in 32 or 64-bit mode, which makes the test for reverse moves
701 // a little tricky. We've already filtered out the (source==dest) and (len==0)
702 // special cases.
703 //
704 // When entered:
705 // r4 = destination (32 or 64-bit ptr)
706 // r5 = length (always 32 bits)
707 // r6 = source (32 or 64-bit ptr)
708 // r12 = (dest - source), reverse move required if (dest-source)<length
709 // cr5 = noncache flag
710
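//
// The mode-independent direction test, as a hedged C sketch (illustrative only):
//
//   #include <stdint.h>
//
//   // r12 = dest - source (full register width), r7 = the 32-bit length.  subc/addze
//   // below recover the carry bit: carry is set iff (dest - source) >= length as an
//   // unsigned compare, i.e. iff a forward move is safe.  In C terms:
//   static int forward_is_safe(uint64_t dest, uint64_t source, uint32_t length) {
//       return (dest - source) >= (uint64_t)length;
//   }
//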
711 .align 5
712 copyit64:
713 rlwinm r7,r5,0,0,31 // truncate length to 32-bit, in case we're running in 64-bit mode
714 cntlzw r11,r5 // get magnitude of length
715 dcbt 0,r6 // touch in 1st block of source
716 dcbtst 0,r4 // touch in 1st destination cache block
717 subc r7,r12,r7 // set Carry if (dest-source)>=length, in mode-independent way
718 li r0,0 // get a 0
719 lis r10,hi16(0x80000000)// get 0x80000000
720 addze. r0,r0 // set cr0 on carry bit (beq if reverse move required)
721 neg r9,r4 // start to get alignment for destination
722 sraw r8,r10,r11 // get mask based on operand length, to limit alignment
723 bt-- noncache,c64uncached// skip if uncached
724 beq-- c64rdouble // handle cached reverse moves
725
726
727 // Forward, cached or doubleword aligned uncached. This is the common case.
728 // NOTE: we never do an unaligned access if the source and destination are "relatively"
729 // doubleword aligned. We depend on this in the uncached case.
730 // r4 = destination
731 // r5 = length (>0)
732 // r6 = source
733 // r8 = inverse of largest mask smaller than operand length
734 // r9 = neg(dest), used to compute alignment
735 // cr5 = noncache flag
736
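//
// Overall shape of this path, as a hedged C sketch (illustrative only; the real code
// below also limits the front-end moves by operand length, software-pipelines the
// loads and stores, and uses dcbz128 when it is safe to do so):
//
//   #include <stdint.h>
//   #include <stddef.h>
//
//   static void copy_forward64_sketch(uint8_t *dst, const uint8_t *src, size_t n) {
//       while (n && ((uintptr_t)dst & 7))  { *dst++ = *src++; n--; }     // byte-align dest
//       while (n >= 8 && ((uintptr_t)dst & 127)) {                       // dword-align to 128
//           *(uint64_t *)dst = *(const uint64_t *)src;
//           dst += 8; src += 8; n -= 8;
//       }
//       while (n >= 128) {                                               // 128-byte chunks
//           for (int i = 0; i < 16; i++)
//               ((uint64_t *)dst)[i] = ((const uint64_t *)src)[i];
//           dst += 128; src += 128; n -= 128;
//       }
//       while (n >= 8) {                                                 // leftover doublewords
//           *(uint64_t *)dst = *(const uint64_t *)src;
//           dst += 8; src += 8; n -= 8;
//       }
//       while (n--) { *dst++ = *src++; }                                 // leftover bytes
//   }
//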
737 c64double:
738 rlwinm r7,r9,0,0x7F // get #bytes to 128-byte align destination
739 andc r7,r7,r8 // limit by operand length
740 andi. r8,r7,7 // r8 <- #bytes to doubleword align
741 srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
742 sub r5,r5,r7 // adjust length remaining
743 cmpwi cr1,r9,0 // any doublewords to move to cache align?
744 srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
745 cmpwi cr7,r10,0 // set cr7 on chunk count
746 beq c64double2 // dest already doubleword aligned
747 mtctr r8
748 b c64double1
749
750 .align 5 // align inner loops
751 c64double1: // copy bytes until dest is doubleword aligned
752 lbz r0,0(r6)
753 addi r6,r6,1
754 stb r0,0(r4)
755 addi r4,r4,1
756 bdnz c64double1
757
758 c64double2: // r9/cr1=doublewords, r10/cr7=128-byte chunks
759 beq cr1,c64double4 // no doublewords to xfer in order to cache align
760 mtctr r9
761 b c64double3
762
763 .align 5 // align inner loops
764 c64double3: // copy doublewords until dest is 128-byte aligned
765 ld r7,0(r6)
766 addi r6,r6,8
767 std r7,0(r4)
768 addi r4,r4,8
769 bdnz c64double3
770
771 // Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
772 // data (64 bytes), we load/store each twice per 128-byte chunk.
773
774 c64double4: // r10/cr7=128-byte chunks
775 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
776 cmpwi cr1,r0,0 // set cr1 on leftover doublewords
777 beq cr7,c64double7 // no 128-byte chunks
778
779 ; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
780 ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
781
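;
; As a hedged C sketch of that test (illustrative only): dcbz128 zeroes the whole
; 128-byte destination line, so if the source starts inside that line its data
; would be destroyed before it is loaded.
;
;   static int dcbz_would_clobber_source(uint64_t dest, uint64_t source) {
;       return (uint64_t)(source - dest) < 128;   /* source lies in the line we zero */
;   }
;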
782 sub r8,r6,r4 // r8 <- (source - dest)
783 rldicr. r0,r8,0,63-7 // zero low 7 bits and check for 0, mode independent
784 cror noncache,cr0_eq,noncache // turn on "noncache" flag if (source-dest)<128
785 mtctr r10
786 b c64InnerLoop
787
788 .align 5 // align inner loop
789 c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
790 ld r0,0(r6) // start pipe: load 1st half-line
791 ld r2,8(r6)
792 ld r7,16(r6)
793 ld r8,24(r6)
794 ld r9,32(r6)
795 ld r10,40(r6)
796 ld r11,48(r6)
797 ld r12,56(r6)
798 bt noncache,c64InnerLoop1 // skip if uncached or overlap
799 dcbz128 0,r4 // avoid prefetch of next cache line
800 c64InnerLoop1:
801
802 std r0,0(r4)
803 std r2,8(r4)
804 std r7,16(r4)
805 std r8,24(r4)
806 std r9,32(r4)
807 std r10,40(r4)
808 std r11,48(r4)
809 std r12,56(r4)
810
811 ld r0,64(r6) // load 2nd half of chunk
812 ld r2,72(r6)
813 ld r7,80(r6)
814 ld r8,88(r6)
815 ld r9,96(r6)
816 ld r10,104(r6)
817 ld r11,112(r6)
818 ld r12,120(r6)
819 addi r6,r6,128
820
821 std r0,64(r4)
822 std r2,72(r4)
823 std r7,80(r4)
824 std r8,88(r4)
825 std r9,96(r4)
826 std r10,104(r4)
827 std r11,112(r4)
828 std r12,120(r4)
829 addi r4,r4,128 // advance to next dest chunk
830
831 bdnz c64InnerLoop // loop if more chunks
832
833
834 c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
835 rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
836 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
837 beq cr1,c64byte // no leftover doublewords
838 mtctr r0
839 b c64double8
840
841 .align 5 // align inner loop
842 c64double8: // loop copying leftover doublewords
843 ld r0,0(r6)
844 addi r6,r6,8
845 std r0,0(r4)
846 addi r4,r4,8
847 bdnz c64double8
848
849
850 // Forward byte loop.
851
852 c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
853 beqlr // done if no leftover bytes
854 mtctr r5
855 b c64byte1
856
857 .align 5 // align inner loop
858 c64byte1:
859 lbz r0,0(r6)
860 addi r6,r6,1
861 stb r0,0(r4)
862 addi r4,r4,1
863 bdnz c64byte1
864
865 blr
866
867
868 // Uncached copies. We must avoid unaligned accesses, since they always take alignment
869 // exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
870 // a byte at a time, but that is still much faster than alignment exceptions.
871 // r4 = destination
872 // r5 = length (>0)
873 // r6 = source
874 // r8 = inverse of largest mask smaller than operand length
875 // r9 = neg(dest), used to compute alignment
876 // r12 = (dest-source), used to test relative alignment
877 // cr0 = beq if reverse move required
878 // cr5 = noncache flag
879
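//
// The relative-alignment dispatch, as a hedged C sketch (illustrative only):
//
//   #include <stdint.h>
//
//   // delta = dest - source.  Only *relative* alignment matters: once the front end
//   // aligns the destination, the source has the same low bits, so if the low 3 (or 2)
//   // bits of delta are zero the copy can use doubleword (or word) loads and stores
//   // without ever making an unaligned access.
//   static int relatively_dword_aligned(uint64_t delta) { return (delta & 7) == 0; }
//   static int relatively_word_aligned (uint64_t delta) { return (delta & 3) == 0; }
//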
880 c64uncached:
881 rlwinm r10,r12,0,29,31 // relatively doubleword aligned?
882 rlwinm r11,r12,0,30,31 // relatively word aligned?
883 cmpwi cr7,r10,0 // set cr7 beq if doubleword aligned
884 cmpwi cr1,r11,0 // set cr1 beq if word aligned
885 beq-- c64reverseUncached
886
887 beq cr7,c64double // doubleword aligned
888 beq cr1,forward32bit // word aligned, use G3/G4 code
889 cmpwi r5,0 // set cr0 on byte count
890 b c64byte // unaligned operands
891
892 c64reverseUncached:
893 beq cr7,c64rdouble // doubleword aligned so can use LD/STD
894 beq cr1,reverse32bit // word aligned, use G3/G4 code
895 add r6,r6,r5 // point to (end+1) of source and dest
896 add r4,r4,r5
897 cmpwi r5,0 // set cr0 on length
898 b c64rbyte // copy a byte at a time
899
900
901
902 // Reverse doubleword copies. This is used for all cached reverse copies, and for
903 // doubleword-aligned uncached reverse copies.
904 // r4 = destination
905 // r5 = length (>0)
906 // r6 = source
907 // r8 = inverse of largest mask of low-order 1s smaller than operand length
908 // cr5 = noncache flag
909
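//
// Shape of this path, as a hedged C sketch (illustrative only; the real code below
// additionally unrolls the doubleword loop into 64-byte chunks):
//
//   #include <stdint.h>
//   #include <stddef.h>
//
//   static void copy_reverse64_sketch(uint8_t *dst, const uint8_t *src, size_t n) {
//       dst += n; src += n;                                           // point one past the end
//       while (n && ((uintptr_t)dst & 7)) { *--dst = *--src; n--; }   // dword-align dest
//       while (n >= 8) {                                              // descend by doublewords
//           dst -= 8; src -= 8; n -= 8;
//           *(uint64_t *)dst = *(const uint64_t *)src;
//       }
//       while (n--) *--dst = *--src;                                  // leftover bytes
//   }
//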
910 c64rdouble:
911 add r6,r6,r5 // point to (end+1) of source and dest
912 add r4,r4,r5
913 rlwinm r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
914 andc. r7,r7,r8 // limit by operand length
915 sub r5,r5,r7 // adjust length
916 srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
917 cmpwi cr1,r8,0 // any chunks?
918 beq c64rd2 // dest already doubleword aligned
919 mtctr r7
920
921 c64rd1: // copy bytes until dest is doubleword aligned
922 lbzu r0,-1(r6)
923 stbu r0,-1(r4)
924 bdnz c64rd1
925
926 c64rd2: // r8/cr1 <- count of 64-byte chunks
927 rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
928 andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
929 cmpwi cr7,r0,0 // leftover doublewords?
930 beq cr1,c64rd4 // no chunks to xfer
931 mtctr r8
932 b c64rd3
933
934 .align 5 // align inner loop
935 c64rd3: // loop copying 64-byte chunks
936 ld r7,-8(r6)
937 ld r8,-16(r6)
938 ld r9,-24(r6)
939 ld r10,-32(r6)
940 ld r11,-40(r6)
941 ld r12,-48(r6)
942 std r7,-8(r4)
943 std r8,-16(r4)
944 ld r7,-56(r6)
945 ldu r8,-64(r6)
946 std r9,-24(r4)
947 std r10,-32(r4)
948 std r11,-40(r4)
949 std r12,-48(r4)
950 std r7,-56(r4)
951 stdu r8,-64(r4)
952 bdnz c64rd3
953
954 c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
955 beq cr7,c64rbyte // no leftover doublewords
956 mtctr r0
957
958 c64rd5: // loop copying leftover doublewords
959 ldu r0,-8(r6)
960 stdu r0,-8(r4)
961 bdnz c64rd5
962
963
964 // Reverse byte loop.
965
966 c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
967 beqlr // done if no leftover bytes
968 mtctr r5
969
970 c64rbyte1:
971 lbzu r0,-1(r6)
972 stbu r0,-1(r4)
973 bdnz c64rbyte1
974
975 blr
976