/*
 * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
; Copy bytes of data around. Handles overlapped data.

#include <ppc/proc_reg.h>

; These routines use CR5 for certain flags:
; Use CR5_lt to indicate non-cached (in bcopy and memcpy)

; The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
#define BCOPY_SF_SIZE   32              // total size
#define BCOPY_SF_MSR    16              // we save caller's MSR here (possibly minus VEC and FP)

#define kShort          32              // short operands are special cased
; void bcopy_physvir_32(from, to, nbytes)
;
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: - neither source nor destination can cross a page.
;            - interrupts must be disabled when this routine is called.
;            - translation must be on when called.
;
; To do the copy, we build a 128KB DBAT for both the source and sink. If both fall in the
; same block, only one DBAT is loaded. We do not touch the IBATs, so there is no issue if
; either physical page address is the same as the virtual address of the instructions we
; are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored; this
; should be OK, since such machines cannot hand us addresses wider than 32 bits here.
;
; Note also that this routine is used only on 32-bit machines. If you're contemplating use
; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
; for an example of how this is done.
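; A minimal C sketch (not part of the build) of the page-crossing test performed below
; before the DBAT fast path is chosen; it assumes 4KB pages and looks only at the low
; 32 bits of the physical address:
;
;   static int crosses_page(unsigned int pa, unsigned int nbytes)
;   {
;       unsigned int last = pa + (nbytes - 1);      /* last byte touched           */
;       return ((pa ^ last) & 0xFFFFF000u) != 0;    /* did the page number change? */
;   }
;
; If either operand crosses a page, the code falls back to the normal bcopy_phys path.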
            .globl  EXT(bcopy_physvir_32)

LEXT(bcopy_physvir_32)
            mflr    r0                          ; get return address
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mfsprg  r8,2                        ; get processor feature flags
            stw     r0,8(r1)                    ; save return address
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            subi    r0,r7,1                     ; get length - 1
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            add     r11,r3,r0                   ; Point to last byte of sink
            mr      r5,r7                       ; Get the length into the right register
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
; This test for page overflow may not work if the length is negative. Negative lengths are
; invalid input to bcopy_physvir_32() on 32-bit machines, and will result in a panic.
            add     r12,r4,r0                   ; Point to last byte of source
            xor     r7,r11,r3                   ; See if we went to next page
            xor     r8,r12,r4                   ; See if we went to next page
            or      r0,r7,r8                    ; Combine wrap

//          li      r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2)    ; Set default attributes
            li      r9,((2<<3)|2)               ; Set default attributes
            rlwinm. r0,r0,0,0,19                ; Did we overflow a page?
            li      r7,2                        ; Set validity flags
            li      r8,2                        ; Set validity flags
            bne-    bcopy_phys1                 ; Overflowed page, do normal physical copy...

            rlwimi  r11,r9,0,15,31              ; Set sink lower DBAT value
            rlwimi  r12,r9,0,15,31              ; Set source lower DBAT value
            rlwimi  r7,r11,0,0,14               ; Set sink upper DBAT value
            rlwimi  r8,r12,0,0,14               ; Set source upper DBAT value
            cmplw   cr1,r11,r12                 ; See if sink and source are same block

            mtdbatl 0,r11                       ; Set sink lower DBAT
            mtdbatu 0,r7                        ; Set sink upper DBAT

            beq-    cr1,bcpvsame                ; Source and sink are in same block

            mtdbatl 1,r12                       ; Set source lower DBAT
            mtdbatu 1,r8                        ; Set source upper DBAT

bcpvsame:
            sync                                ; wait for the BATs to stabilize
            isync                               ; context-synchronize before using the new mappings

            bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on

            li      r0,0                        ; Get set to invalidate upper half of BATs
            sync                                ; Make sure all is well
            mtdbatu 0,r0                        ; Clear sink upper DBAT
            mtdbatu 1,r0                        ; Clear source upper DBAT

            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            mtlr    r0                          ; restore return address
            blr
; void bcopy_phys(from, to, nbytes)
;
; Turns off data translation before the copy. This one will not work in user state.
; This routine is used on both 32- and 64-bit machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. On 32-bit architectures the high half of the passed-in value is ignored; this
; should be OK, since such machines cannot hand us addresses wider than 32 bits here.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space. There is a good chance of hanging the machine, and this
; code does not check for it, so be careful.
;
; NOTE: when called, translation must be on, and we must be in 32-bit mode.
;       Interrupts may or may not be disabled.
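; Hedged C-level sketch of the control flow only (not part of the build); read_msr(),
; write_msr(), and copy_with_translation_off() are hypothetical stand-ins for the
; mfmsr/mtmsr(d) sequences and the bcopy call below:
;
;   void bcopy_phys_sketch(unsigned long long from, unsigned long long to,
;                          unsigned int nbytes)
;   {
;       unsigned long long msr = read_msr();           /* caller's MSR                    */
;       write_msr(msr & ~(MSR_DR | MSR_VEC | MSR_FP)); /* data translation, VEC, FP off   */
;       copy_with_translation_off(from, to, nbytes);   /* ordinary bcopy on phys addrs    */
;       write_msr(msr);                                /* put translation back as it was  */
;   }
;
; The real code below also sets MSR_SF (64-bit addressing) on 64-bit processors and
; special-cases operands that lie in I/O space.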
            .globl  EXT(bcopy_phys)

LEXT(bcopy_phys)
            mflr    r0                          ; get return address
            rlwinm  r3,r3,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            stw     r0,8(r1)                    ; save return address
            mfsprg  r8,2                        ; get processor feature flags
            stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
            rlwimi  r3,r4,0,0,31                ; Combine bottom of long long to full 64-bits
            rlwinm  r4,r5,0,1,0                 ; Duplicate high half of long long paddr into top of reg
            mtcrf   0x02,r8                     ; move pf64Bit to cr6 so we can test
            rlwimi  r4,r6,0,0,31                ; Combine bottom of long long to full 64-bits
            mr      r5,r7                       ; Get the length into the right register
bcopy_phys1:                                    ; enter from bcopy_physvir with pf64Bit in cr6 and parms in r3-r5
            mfmsr   r9                          ; Get the MSR
            lis     r6,hi16(MASK(MSR_VEC))      ; Get vector enable
            ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
            andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
            bt++    pf64Bitb,bcopy_phys64       ; skip if 64-bit (only they take hint)

; 32-bit CPUs

            mtmsr   r9                          ; turn DR, FP, and VEC off
            isync                               ; wait for it to happen

            bl      EXT(bcopy)                  ; do the copy with translation off and caching on

            mfmsr   r9                          ; Get the MSR
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
            mtmsr   r9                          ; restore msr
            isync                               ; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            mtlr    r0                          ; restore return address
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr
; 64-bit: turn DR off and SF on.

bcopy_phys64:                                   ; r9 = MSR with DR, VEC, and FP off
            ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
            srdi    r2,r3,31                    ; Get a 1 if source is in I/O memory
            li      r0,1                        ; Note - we use this in a couple places below
            srdi    r10,r4,31                   ; Get a 1 if sink is in I/O memory
            std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
            rldimi  r9,r0,63,MSR_SF_BIT         ; set SF on in MSR we will copy with
            cmpldi  cr0,r2,1                    ; Is source in I/O memory?
            cmpldi  cr7,r10,1                   ; Is sink in I/O memory?
            mtmsrd  r9                          ; turn 64-bit addressing on, data translation off
            isync                               ; wait for it to happen
            cror    cr7_eq,cr0_eq,cr7_eq        ; See if either source or sink is in I/O area
            beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space

            bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled

bcopy_phys64x:                                  ; the I/O-space path below rejoins here
            mfmsr   r9                          ; Get the MSR we used to copy
            rldicl  r9,r9,0,MSR_SF_BIT+1        ; clear SF
            ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
            mtmsrd  r9                          ; turn 64-bit mode off, translation back on
            isync                               ; wait for it to happen
            lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
            ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
            mtlr    r0                          ; restore return address
            mtmsrd  r8,1                        ; turn EE back on if necessary
            addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
            blr
; We need to copy with DR off, but one of the operands is in I/O space. To avoid wedging U3,
; which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
; This can only be done by setting bits in HID4. We cannot lose control and execute random code in
; this state, so we have to disable interrupts as well. This is an unpleasant hack.
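; Flow-only C sketch (not part of the build) of the sequence below; read_hid4(),
; write_hid4(), disable_interrupts(), and bcopy_nc_on_phys() are hypothetical stand-ins
; for the mfspr/mtspr/mtmsrd instructions and the bcopy_nc call, and the bit position
; mirrors the "sldi by 32+8" computed below:
;
;   void io_space_copy_sketch(unsigned long long from, unsigned long long to,
;                             unsigned int nbytes)
;   {
;       unsigned long long ci_bit = 1ULL << 40;     /* cache-inhibit control bit          */
;       disable_interrupts();                       /* must not lose control in this state */
;       write_hid4(read_hid4() | ci_bit);           /* real-mode accesses now uncached     */
;       bcopy_nc_on_phys(from, to, nbytes);         /* copy with no cache instructions     */
;       write_hid4(read_hid4() & ~ci_bit);          /* restore normal cacheability         */
;   }
;
; The point is the ordering: interrupts stay off for the entire window in which HID4
; marks real accesses cache-inhibited.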
io_space_real_mode_copy:                        ; r0=1, r9=MSR we want to copy with
            sldi    r11,r0,31-MSR_EE_BIT        ; Get a mask for the EE bit
            sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
            andc    r9,r9,r11                   ; Turn off EE bit
            mfspr   r2,hid4                     ; Get HID4
            mtmsrd  r9,1                        ; Force off EE
            or      r2,r2,r0                    ; Set bit to make real accesses cache-inhibited
            sync                                ; Ensure prior accesses complete before changing HID4
            mtspr   hid4,r2                     ; Make real accesses cache-inhibited
            isync                               ; Toss prefetches

            lis     r12,0xE000                  ; Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
            slbie   r12                         ; Make sure the ERAT is cleared

            bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited

            li      r0,1                        ; Get a 1
            sldi    r0,r0,32+8                  ; Get the right bit to turn off caching
            mfspr   r2,hid4                     ; Get HID4
            andc    r2,r2,r0                    ; Clear bit to make real accesses cache-inhibited
            sync                                ; Ensure the copy's accesses complete before changing HID4
            mtspr   hid4,r2                     ; Make real accesses not cache-inhibited
            isync                               ; Toss prefetches

            lis     r12,0xE000                  ; Get the unlikeliest ESID possible
            srdi    r12,r12,1                   ; Make 0x7FFFFFFFF0000000
            slbie   r12                         ; Make sure the ERAT is cleared

            b       bcopy_phys64x               ; rejoin the common 64-bit exit path above
; Special case short operands (<32 bytes), which are very common. Note that the check for
; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
; reverse when it wasn't necessary to do so. This is OK, since performance of the two cases
; is similar. We do get the direction right when it counts (i.e., when the operands overlap.)
; Also note that we use the G3/G4 "backend" code, even on G5. This is OK too, since G5 has
; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
; which cost about 20 cycles if they cross a 32-byte boundary on G5. Finally, because we
; might do unaligned accesses this code cannot be called from bcopy_nc().
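; Hedged C sketch (not part of the build) of the direction test used here and in the
; full-size paths: treat (dest - source) as unsigned; if it is smaller than the length,
; a forward copy would overwrite source bytes before they are read, so copy in reverse.
; Unsigned wraparound makes the same test answer "forward" whenever dest < source:
;
;   static int must_copy_reverse(unsigned long dest, unsigned long source,
;                                unsigned long nbytes)
;   {
;       return (dest - source) < nbytes;
;   }
;
; This is also why the check above is only "almost" exact in 64-bit mode: it errs on the
; side of a reverse move, never the other way, so overlapping operands are always safe.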
; r12 = (dest - source)

shortcopy:
            cmplw   r12,r5                      ; must move reverse if (dest-source)<length
            mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
            mtcrf   1,r5                        ; ...which is faster on G4 and G5
            bge++   backend                     ; handle forward moves (most common case)

            add     r6,r6,r5                    ; point one past end of operands in reverse moves
            add     r4,r4,r5
            b       bbackend                    ; handle reverse moves
; void bcopy(from, to, nbytes)
;
; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
; to the thread_recover routine. What this means is that it would be hard to use vector or floating point
; registers to accelerate the copy.
;
; NOTE: this code can be called in any of three "modes":
;       - on 32-bit processors (32-byte cache line)
;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
;       - on 64-bit processors running in 64-bit mode (128-byte cache line)
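; Hedged C-level sketch (not part of the build) of the dispatch done at the entry point
; below; copy_short(), copy_32(), copy_64(), and cpu_is_64bit() are hypothetical
; stand-ins for shortcopy, copyit32, copyit64, and the branch patched to NOP at boot:
;
;   void bcopy_sketch(const char *from, char *to, unsigned long nbytes)
;   {
;       if (nbytes < kShort) {                /* kShort == 32, defined above        */
;           copy_short(from, to, nbytes);     /* short operands, no cache ops       */
;       } else if (to != from) {              /* to == from means nothing to do     */
;           if (cpu_is_64bit())
;               copy_64(from, to, nbytes);    /* 128-byte cache line backend        */
;           else
;               copy_32(from, to, nbytes);    /*  32-byte cache line backend        */
;       }
;   }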
            .globl  EXT(bcopy)
            .globl  EXT(bcopy_nop_if_32bit)

LEXT(bcopy)
            cmplwi  cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
            blt     cr1,shortcopy               ; special case short operands
            crclr   noncache                    ; Set cached
LEXT(bcopy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            bne+    copyit32                    ; handle 32-bit processor
            blr                                 ; to==from so nothing to do
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc() operates on non-cached memory, so we cannot use any kind of cache instruction.
; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
; alignment exceptions. Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
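; Hedged C sketch (not part of the build) of why the always-safe fallback for uncached
; memory is a plain byte loop: byte accesses are naturally aligned by definition, so they
; can never raise an alignment exception on an uncached mapping.
;
;   static void copy_bytes_forward(const unsigned char *src, unsigned char *dst,
;                                  unsigned long n)
;   {
;       while (n--)
;           *dst++ = *src++;
;   }
;
; The code below does better than this whenever the operands are relatively word or
; doubleword aligned, but never worse.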
            .globl  EXT(bcopy_nc)
            .globl  EXT(bcopy_nc_nop_if_32bit)

LEXT(bcopy_nc)
            cmpwi   cr1,r5,0                    ; Check if we have a 0 length
            sub.    r12,r4,r3                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r3                       ; Set source (must preserve r3 for memcpy return)
            crset   noncache                    ; Set non-cached
            cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
LEXT(bcopy_nc_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            bne+    copyit32                    ; handle 32-bit processor
            blr                                 ; either zero length or to==from
; void* memcpy(to, from, nbytes)
; void* memmove(to, from, nbytes)
;
; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
; However, they would work correctly if called in 64-bit mode.
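; Hedged C sketch (not part of the build) of the argument shuffle performed below:
; memcpy()/memmove() take (to, from, n) and must return "to", while the shared copy
; engine works with bcopy()'s (from, to, n) ordering. Because the engine already handles
; overlap, memmove() needs nothing extra:
;
;   void *memmove_sketch(void *to, const void *from, unsigned long nbytes)
;   {
;       bcopy(from, to, nbytes);    /* engine copies forward or reverse as needed */
;       return to;                  /* r3 is preserved for exactly this reason    */
;   }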
            .globl  EXT(memcpy)
            .globl  EXT(memmove)
            .globl  EXT(memcpy_nop_if_32bit)

LEXT(memcpy)
LEXT(memmove)
            cmplwi  cr1,r5,kShort               ; less than 32 bytes?
            sub.    r12,r3,r4                   ; test for to==from in mode-independent way, start fwd/rev check
            mr      r6,r4                       ; Set source
            mr      r4,r3                       ; Set the "to" (must preserve r3 for return value)
            blt     cr1,shortcopy               ; special case short operands
            crclr   noncache                    ; Set cached
LEXT(memcpy_nop_if_32bit)
            bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
            beqlr-                              ; exit if to==from
                                                ; ...otherwise fall through into copyit32
; Here to copy on 32-bit processors.
;
; When we move the memory, forward overlap must be handled (that is why we may have to
; copy in reverse.) We also cannot use the cache instructions if we came from bcopy_nc().
; We need to preserve r3 because it must be returned for memcpy.
; We can be interrupted and lose control here.
;
; r12 = (dest - source)
; cr5 = noncache flag
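; Hedged C sketch (not part of the build) of the cntlzw/sraw trick used below (and again
; at copyit64): derive a mask from the length's magnitude so that the "align the
; destination" front end can never be asked to move more bytes than the operand has.
;
;   static unsigned int align_limit_mask(unsigned int len)   /* requires len != 0 */
;   {
;       int clz = __builtin_clz(len);                        /* cntlzw r11,r5     */
;       return (unsigned int)((int)0x80000000 >> clz);       /* sraw  r8,r10,r11  */
;   }
;
; The front-end byte count is then computed as "bytes_to_align & ~mask" (the andc below),
; which clamps it to less than the operand length.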
copyit32:                                       ; WARNING! can drop down to this label
            cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
            cntlzw  r11,r5                      ; get magnitude of length
            dcbt    0,r6                        ; start to touch in source
            lis     r10,hi16(0x80000000)        ; get 0x80000000
            neg     r9,r4                       ; start to get alignment for destination
            dcbtst  0,r4                        ; start to touch in destination
            sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
            blt-    cr1,reverse32bit            ; reverse move required
; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
;
; r8 = inverse of largest mask smaller than operand length
; r9 = neg(dest), used to compute alignment
; cr5 = noncache flag
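; A short C note (not part of the build) on the "relatively word aligned" property relied
; on here: once the destination has been word aligned by the front end, the source is
; word aligned too exactly when (dest - source) is a multiple of 4.
;
;   static int relatively_word_aligned(unsigned long dest, unsigned long source)
;   {
;       return ((dest - source) & 3) == 0;
;   }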
forward32bit:                                   ; enter from 64-bit CPUs with word aligned uncached operands
            rlwinm  r7,r9,0,0x1F                ; get bytes to 32-byte-align destination
            andc.   r0,r7,r8                    ; limit to the maximum front end move
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
            beq     alline                      ; Already on a line...

            mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
            sub     r5,r5,r0                    ; Set the length left to move

            bf      31,alhalf                   ; No single byte to do...
            lbz     r7,0(r6)                    ; Get the byte
            addi    r6,r6,1                     ; Point to the next
            stb     r7,0(r4)                    ; Save the single
            addi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

alhalf:     bf      30,alword                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

alword:     bf      29,aldouble                 ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

aldouble:   bf      28,alquad                   ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

alquad:     bf      27,alline                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            stw     r7,0(r4)                    ; Save the first word
            lwz     r11,12(r6)                  ; Get the fourth word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink

; Sink is line aligned here
alline:     rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
            mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
            beq-    backend                     ; No full lines to move

            mtctr   r0                          ; set up loop count
            li      r0,96                       ; Stride for touch ahead

nxtline:
            lwz     r2,0(r6)                    ; Get the first word
            lwz     r5,4(r6)                    ; Get the second word
            lwz     r7,8(r6)                    ; Get the third word
            lwz     r8,12(r6)                   ; Get the fourth word
            lwz     r9,16(r6)                   ; Get the fifth word
            lwz     r10,20(r6)                  ; Get the sixth word
            lwz     r11,24(r6)                  ; Get the seventh word
            lwz     r12,28(r6)                  ; Get the eighth word
            bt-     noncache,skipz              ; Skip if we are not cached...
            dcbz    0,r4                        ; Blow away the whole line because we are replacing it
            dcbt    r6,r0                       ; Touch ahead a bit

skipz:
            addi    r6,r6,32                    ; Point to the next
            stw     r2,0(r4)                    ; Save the first word
            stw     r5,4(r4)                    ; Save the second word
            stw     r7,8(r4)                    ; Save the third word
            stw     r8,12(r4)                   ; Save the fourth word
            stw     r9,16(r4)                   ; Save the fifth word
            stw     r10,20(r4)                  ; Save the sixth word
            stw     r11,24(r4)                  ; Save the seventh word
            stw     r12,28(r4)                  ; Save the eighth word
            addi    r4,r4,32                    ; Bump sink
            bdnz+   nxtline                     ; Do the next line, if any...
; Move backend quadword

backend:                                        ; Join here from "shortcopy" for forward moves <32 bytes
            bf      27,noquad                   ; No quad to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            lwz     r9,8(r6)                    ; Get the third word
            lwz     r11,12(r6)                  ; Get the fourth word
            stw     r7,0(r4)                    ; Save the first word
            addi    r6,r6,16                    ; Point to the next
            stw     r8,4(r4)                    ; Save the second word
            stw     r9,8(r4)                    ; Save the third word
            stw     r11,12(r4)                  ; Save the fourth word
            addi    r4,r4,16                    ; Bump sink

; Move backend double

noquad:     bf      28,nodouble                 ; No double to do...
            lwz     r7,0(r6)                    ; Get the first word
            lwz     r8,4(r6)                    ; Get the second word
            addi    r6,r6,8                     ; Point to the next
            stw     r7,0(r4)                    ; Save the first word
            stw     r8,4(r4)                    ; Save the second word
            addi    r4,r4,8                     ; Bump sink

nodouble:   bf      29,noword                   ; No word to do...
            lwz     r7,0(r6)                    ; Get the word
            addi    r6,r6,4                     ; Point to the next
            stw     r7,0(r4)                    ; Save the word
            addi    r4,r4,4                     ; Bump sink

; Move backend halfword

noword:     bf      30,nohalf                   ; No halfword to do...
            lhz     r7,0(r6)                    ; Get the halfword
            addi    r6,r6,2                     ; Point to the next
            sth     r7,0(r4)                    ; Save the halfword
            addi    r4,r4,2                     ; Bump sink
nohalf:     bflr    31                          ; Leave cuz we are all done...
            lbz     r7,0(r6)                    ; Get the byte
            stb     r7,0(r4)                    ; Save the single
            blr                                 ; Return, the move is complete
; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
; NOTE: we never do an unaligned access if the source and destination are "relatively"
; word aligned. We depend on this in the uncached case on 64-bit processors.
; These are slower because we don't bother with dcbz. Fortunately, reverse moves are uncommon.
;
; r8 = inverse of largest mask smaller than operand length
; cr5 = noncache flag (but we don't dcbz anyway)
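; Hedged C sketch (not part of the build) of the reverse-move setup used below: both
; pointers are advanced one past the end and the copy walks downward, so an overlapping
; destination above the source is never written before the corresponding bytes are read.
;
;   static void reverse_bytes_sketch(const unsigned char *src, unsigned char *dst,
;                                    unsigned long n)
;   {
;       src += n;                   /* one past the last source byte      */
;       dst += n;                   /* one past the last destination byte */
;       while (n--)
;           *--dst = *--src;
;   }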
reverse32bit:                                   ; here from 64-bit code with word aligned uncached operands
            add     r4,r5,r4                    ; Point past the last sink byte
            add     r6,r5,r6                    ; Point past the last source byte
            rlwinm  r7,r4,0,0x1F                ; Calculate the length to align dest on cache boundary
            li      r12,-1                      ; Make sure we touch in the actual line
            andc.   r0,r7,r8                    ; Apply movement limit
            dcbt    r12,r6                      ; Touch in the last line of source
            mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
            dcbtst  r12,r4                      ; Touch in the last line of the sink
            mtcrf   0x02,r0                     ; ...since moving more than one is slower on G4 and G5
            beq-    balline                     ; Already on cache line boundary (or too short to bother)

            sub     r5,r5,r0                    ; Precalculate move length left after alignment

            bf      31,balhalf                  ; No single byte to do...
            lbz     r7,-1(r6)                   ; Get the byte
            subi    r6,r6,1                     ; Point to the next
            stb     r7,-1(r4)                   ; Save the single
            subi    r4,r4,1                     ; Bump sink
; Sink is halfword aligned here

balhalf:    bf      30,balword                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink

; Sink is word aligned here

balword:    bf      29,baldouble                ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

; Sink is double aligned here

baldouble:  bf      28,balquad                  ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

; Sink is quadword aligned here

balquad:    bf      27,balline                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

; Sink is line aligned here

balline:    rlwinm. r0,r5,27,5,31               ; Get the number of full lines to move
            mtcrf   0x02,r5                     ; move length to cr6 and cr7 one cr at a time...
            mtcrf   0x01,r5                     ; ...since moving more than one is slower on G4 and G5
            beq-    bbackend                    ; No full lines to move
            mtctr   r0                          ; set up loop count
bnxtline:
            lwz     r7,-32(r6)                  ; Get the first word
            lwz     r5,-28(r6)                  ; Get the second word
            lwz     r2,-24(r6)                  ; Get the third word
            lwz     r12,-20(r6)                 ; Get the fourth word
            lwz     r11,-16(r6)                 ; Get the fifth word
            lwz     r10,-12(r6)                 ; Get the sixth word
            lwz     r9,-8(r6)                   ; Get the seventh word
            lwz     r8,-4(r6)                   ; Get the eighth word
            subi    r6,r6,32                    ; Point to the next

            stw     r7,-32(r4)                  ; Save the first word
            stw     r5,-28(r4)                  ; Save the second word
            stw     r2,-24(r4)                  ; Save the third word
            stw     r12,-20(r4)                 ; Save the fourth word
            stw     r11,-16(r4)                 ; Save the fifth word
            stw     r10,-12(r4)                 ; Save the sixth word
            stw     r9,-8(r4)                   ; Save the seventh word
            stw     r8,-4(r4)                   ; Save the eighth word
            subi    r4,r4,32                    ; Bump sink

            bdnz+   bnxtline                    ; Do the next line, if any...
; Note: We touched these lines in at the beginning

; Move backend quadword
bbackend:                                       ; Join here from "shortcopy" for reverse moves of <32 bytes
            bf      27,bnoquad                  ; No quad to do...
            lwz     r7,-16(r6)                  ; Get the first word
            lwz     r8,-12(r6)                  ; Get the second word
            lwz     r9,-8(r6)                   ; Get the third word
            lwz     r11,-4(r6)                  ; Get the fourth word
            stw     r7,-16(r4)                  ; Save the first word
            subi    r6,r6,16                    ; Point to the next
            stw     r8,-12(r4)                  ; Save the second word
            stw     r9,-8(r4)                   ; Save the third word
            stw     r11,-4(r4)                  ; Save the fourth word
            subi    r4,r4,16                    ; Bump sink

; Move backend double

bnoquad:    bf      28,bnodouble                ; No double to do...
            lwz     r7,-8(r6)                   ; Get the first word
            lwz     r8,-4(r6)                   ; Get the second word
            subi    r6,r6,8                     ; Point to the next
            stw     r7,-8(r4)                   ; Save the first word
            stw     r8,-4(r4)                   ; Save the second word
            subi    r4,r4,8                     ; Bump sink

bnodouble:  bf      29,bnoword                  ; No word to do...
            lwz     r7,-4(r6)                   ; Get the word
            subi    r6,r6,4                     ; Point to the next
            stw     r7,-4(r4)                   ; Save the word
            subi    r4,r4,4                     ; Bump sink

; Move backend halfword

bnoword:    bf      30,bnohalf                  ; No halfword to do...
            lhz     r7,-2(r6)                   ; Get the halfword
            subi    r6,r6,2                     ; Point to the next
            sth     r7,-2(r4)                   ; Save the halfword
            subi    r4,r4,2                     ; Bump sink
bnohalf:    bflr    31                          ; Leave cuz we are all done...
            lbz     r7,-1(r6)                   ; Get the byte
            stb     r7,-1(r4)                   ; Save the single
            blr                                 ; Return, the move is complete
// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// cases.
//
// r4 = destination (32 or 64-bit ptr)
// r5 = length (always 32 bits)
// r6 = source (32 or 64-bit ptr)
// r12 = (dest - source), reverse move required if (dest-source)<length
// cr5 = noncache flag
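// Hedged C sketch (not part of the build) of the mode-independent direction test done
// below: subc sets Carry exactly when the subtraction does not borrow, i.e. when
// (dest - source) >= length as an unsigned 64-bit compare, and addze folds Carry into
// cr0 so it can be branched on. Because the test works on full 64-bit register values,
// it gives the same answer whether we were called in 32- or 64-bit mode.
//
//   static int reverse_move_required(unsigned long long dest_minus_source,
//                                    unsigned int length)
//   {
//       return dest_minus_source < (unsigned long long)length;
//   }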
copyit64:
            rlwinm  r7,r5,0,0,31                // truncate length to 32 bits, in case we're running in 64-bit mode
            cntlzw  r11,r5                      // get magnitude of length
            dcbt    0,r6                        // touch in 1st block of source
            dcbtst  0,r4                        // touch in 1st destination cache block
            subc    r7,r12,r7                   // set Carry if (dest-source)>=length, in mode-independent way
            li      r0,0                        // get a 0 to catch the Carry bit
            lis     r10,hi16(0x80000000)        // get 0x80000000
            addze.  r0,r0                       // set cr0 on carry bit (beq if reverse move required)
            neg     r9,r4                       // start to get alignment for destination
            sraw    r8,r10,r11                  // get mask based on operand length, to limit alignment
            bt--    noncache,c64uncached        // skip if uncached
            beq--   c64rdouble                  // handle cached reverse moves
// Forward, cached or doubleword aligned uncached. This is the common case.
// NOTE: we never do an unaligned access if the source and destination are "relatively"
// doubleword aligned. We depend on this in the uncached case.
//
// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// cr5 = noncache flag
c64double:
            rlwinm  r7,r9,0,0x7F                // get #bytes to 128-byte align destination
            andc    r7,r7,r8                    // limit by operand length
            andi.   r8,r7,7                     // r8 <- #bytes to doubleword align
            srwi    r9,r7,3                     // r9 <- #doublewords to 128-byte align
            sub     r5,r5,r7                    // adjust length remaining
            cmpwi   cr1,r9,0                    // any doublewords to move to cache align?
            srwi    r10,r5,7                    // r10 <- 128-byte chunks to xfer after aligning dest
            cmpwi   cr7,r10,0                   // set cr7 on chunk count
            beq     c64double2                  // dest already doubleword aligned
            .align  5                           // align inner loops
c64double1:                                     // copy bytes until dest is doubleword aligned

c64double2:                                     // r9/cr1=doublewords, r10/cr7=128-byte chunks
            beq     cr1,c64double4              // no doublewords to xfer in order to cache align

            .align  5                           // align inner loops
c64double3:                                     // copy doublewords until dest is 128-byte aligned

// Here to xfer 128-byte chunks, if any. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.

c64double4:                                     // r10/cr7=128-byte chunks
            rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords, after moving chunks
            cmpwi   cr1,r0,0                    // set cr1 on leftover doublewords
            beq     cr7,c64double7              // no 128-byte chunks
; We must check for (source-dest)<128 in a mode-independent way. If within 128 bytes,
; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
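// Hedged C sketch (not part of the build) of the guard above: dcbz128 zeroes the whole
// destination line before it is filled, which would destroy not-yet-copied source bytes
// if the source lies within the same 128 bytes just above the destination.
//
//   static int too_close_for_dcbz128(unsigned long long source, unsigned long long dest)
//   {
//       return ((source - dest) & ~127ULL) == 0;    /* true iff 0 <= source-dest < 128 */
//   }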
            sub     r8,r6,r4                    // r8 <- (source - dest)
            rldicr. r0,r8,0,63-7                // zero low 7 bits and check for 0, mode independent
            cror    noncache,cr0_eq,noncache    // turn on "noncache" flag if (source-dest)<128

            .align  5                           // align inner loop
c64InnerLoop:                                   // loop copying 128-byte cache lines to 128-aligned destination
            ld      r0,0(r6)                    // start pipe: load 1st half-line
            bt      noncache,c64InnerLoop1      // skip if uncached or overlap
            dcbz128 0,r4                        // avoid prefetch of next cache line
c64InnerLoop1:
            ld      r0,64(r6)                   // load 2nd half of chunk
            addi    r4,r4,128                   // advance to next dest chunk
            bdnz    c64InnerLoop                // loop if more chunks
c64double7:                                     // r5 <- leftover bytes, cr1 set on doubleword count
            rlwinm  r0,r5,29,28,31              // r0 <- count of leftover doublewords (0-15)
            andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes (0-7)
            beq     cr1,c64byte                 // no leftover doublewords

            .align  5                           // align inner loop
c64double8:                                     // loop copying leftover doublewords

// Forward byte loop.

c64byte:                                        // r5/cr0 <- byte count (can be big if unaligned uncached)
            beqlr                               // done if no leftover bytes

            .align  5                           // align inner loop
// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
//
// r8 = inverse of largest mask smaller than operand length
// r9 = neg(dest), used to compute alignment
// r12 = (dest-source), used to test relative alignment
// cr0 = beq if reverse move required
// cr5 = noncache flag
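// Hedged C sketch (not part of the build) of the dispatch below: pick the widest access
// size that keeps every load/store naturally aligned, based only on the relative
// alignment of the two operands (their difference), then route to the matching path.
//
//   static int uncached_access_size(unsigned long long dest_minus_source)
//   {
//       if ((dest_minus_source & 7) == 0) return 8;   /* relatively doubleword aligned */
//       if ((dest_minus_source & 3) == 0) return 4;   /* relatively word aligned       */
//       return 1;                                     /* fall back to a byte loop      */
//   }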
c64uncached:                                    // uncached operands: choose access size by relative alignment
            rlwinm  r10,r12,0,29,31             // relatively doubleword aligned?
            rlwinm  r11,r12,0,30,31             // relatively word aligned?
            cmpwi   cr7,r10,0                   // set cr7 beq if doubleword aligned
            cmpwi   cr1,r11,0                   // set cr1 beq if word aligned
            beq--   c64reverseUncached          // handle uncached reverse moves

            beq     cr7,c64double               // doubleword aligned
            beq     cr1,forward32bit            // word aligned, use G3/G4 code
            cmpwi   r5,0                        // set cr0 on byte count
            b       c64byte                     // unaligned operands

c64reverseUncached:
            beq     cr7,c64rdouble              // doubleword aligned so can use LD/STD
            beq     cr1,reverse32bit            // word aligned, use G3/G4 code
            add     r6,r6,r5                    // point to (end+1) of source and dest
            add     r4,r4,r5
            cmpwi   r5,0                        // set cr0 on length
            b       c64rbyte                    // copy a byte at a time
// Reverse doubleword copies. This is used for all cached reverse copies, and for
// doubleword aligned uncached reverse copies.
//
// r8 = inverse of largest mask of low-order 1s smaller than operand length
// cr5 = noncache flag
c64rdouble:                                     // cached or doubleword aligned uncached reverse moves
            add     r6,r6,r5                    // point to (end+1) of source and dest
            add     r4,r4,r5
            rlwinm  r7,r4,0,29,31               // r7 <- #bytes to doubleword align dest
            andc.   r7,r7,r8                    // limit by operand length
            sub     r5,r5,r7                    // adjust length
            srwi    r8,r5,6                     // r8 <- 64-byte chunks to xfer
            cmpwi   cr1,r8,0                    // any chunks?
            beq     c64rd2                      // dest already doubleword aligned

c64rd1:                                         // copy bytes until dest is doubleword aligned

c64rd2:                                         // r8/cr1 <- count of 64-byte chunks
            rlwinm  r0,r5,29,29,31              // r0 <- count of leftover doublewords
            andi.   r5,r5,7                     // r5/cr0 <- count of leftover bytes
            cmpwi   cr7,r0,0                    // leftover doublewords?
            beq     cr1,c64rd4                  // no chunks to xfer

            .align  5                           // align inner loop
c64rd3:                                         // loop copying 64-byte chunks

c64rd4:                                         // r0/cr7 = leftover doublewords, r5/cr0 = leftover bytes
            beq     cr7,c64rbyte                // no leftover doublewords

c64rd5:                                         // loop copying leftover doublewords

// Reverse byte loop.

c64rbyte:                                       // r5/cr0 <- byte count (can be big if unaligned uncached)
            beqlr                               // done if no leftover bytes