/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
;
; Copy bytes of data around. Handles overlapped data.
;
; Change this to use AltiVec later on, and maybe floating point.
;
;
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <assym.s>

; Use CR5_lt to indicate non-cached
#define noncache 20

; Use CR5_gt to indicate that we need to turn data translation back on
#define fixxlate 21

; Use CR5_eq to indicate that we need to invalidate BATs (if 32-bit) or turn off
; 64-bit mode (if 64-bit) before returning to our caller. We overload the
; bit to reduce the number of conditional branches at bcopy exit.
#define restorex 22

; Use CR5_so to indicate that we need to restore real-mode cacheability
; Only needed on 64-bit machines
#define flipcache 23

;
; bcopy_nc(from, to, nbytes)
;
; bcopy_nc operates on non-cached memory, so we can not use any
; cache instructions.
;

    .align 5
    .globl EXT(bcopy_nc)

LEXT(bcopy_nc)

    crset noncache ; Set non-cached
    b bcpswap

;
; void bcopy_physvir(from, to, nbytes)
; Attempt to copy physically addressed memory with translation on if conditions are met.
; Otherwise do a normal bcopy_phys. This routine is used because some 32-bit processors
; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
; for the passed phys addrs and do the copy with translation on.
;
; Rules are: neither source nor destination can cross a page.
;
; Interrupts must be disabled throughout the copy when this is called.
; To do this, we build a 128KB DBAT for both the source and sink. If both are
; the same, only one is loaded. We do not touch the IBATs, so there is no issue
; if either physical page address is the same as the virtual address of the
; instructions we are executing.
;
; At the end, we invalidate the used DBATs.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be ok since we can not have addresses bigger than
; 32 bits there anyhow.
;
; Note, this one will not work in user state
;
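;
; For illustration, the page-crossing test below amounts to the following C
; sketch (the helper name is ours, not something defined in this file):
;
;     static int crosses_page(unsigned int addr, unsigned int len) {
;         return ((addr ^ (addr + len - 1)) & 0xFFFFF000u) != 0;
;     }
;
; i.e. the first and last byte of each operand must share the same 4KB page frame.
;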

    .align 5
    .globl EXT(bcopy_physvir)

LEXT(bcopy_physvir)

    crclr flipcache ; (HACK) No cache flip needed
    mfsprg r8,2 ; get processor feature flags
    rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
    addic. r0,r7,-1 ; Get length - 1
    rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
    add r11,r3,r0 ; Point to last byte of sink
    rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
    mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
    rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
    mr r5,r7 ; Get the length into the right register
    cmplw cr1,r3,r4 ; Does source == sink?
    bt++ pf64Bitb,bcopy_phys1 ; if 64-bit processor, use standard routine (no BATs)
    add r12,r4,r0 ; Point to last byte of source
    bltlr- ; Bail if length is 0 or way too big
    xor r7,r11,r3 ; See if we went to next page
    xor r8,r12,r4 ; See if we went to next page
    or r0,r7,r8 ; Combine wrap

    // li r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
    li r9,((2<<3)|2) ; Set default attributes
    rlwinm. r0,r0,0,0,19 ; Did we overflow a page?
    li r7,2 ; Set validity flags
    li r8,2 ; Set validity flags
    bne- bcopy_phys1 ; Overflowed page, do normal physical copy...

    crset restorex ; Remember to trash BATs on the way out
    rlwimi r11,r9,0,15,31 ; Set sink lower DBAT value
    rlwimi r12,r9,0,15,31 ; Set source lower DBAT value
    rlwimi r7,r11,0,0,14 ; Set sink upper DBAT value
    rlwimi r8,r12,0,0,14 ; Set source upper DBAT value
    cmplw cr1,r11,r12 ; See if sink and source are same block

    sync

    mtdbatl 0,r11 ; Set sink lower DBAT
    mtdbatu 0,r7 ; Set sink upper DBAT

    beq- cr1,bcpvsame ; Source and sink are in same block

    mtdbatl 1,r12 ; Set source lower DBAT
    mtdbatu 1,r8 ; Set source upper DBAT

bcpvsame: mr r6,r3 ; Set source
    crclr noncache ; Set cached
    crclr fixxlate ; Set translation already ok

    b copyit32 ; Go copy it...

;
; void bcopy_phys(from, to, nbytes)
; Turns off data translation before the copy. Note, this one will
; not work in user state. This routine is used on 32 and 64-bit
; machines.
;
; Note that the address parameters are long longs. We will transform these to 64-bit
; values. Note that on 32-bit architectures this will ignore the high half of the
; passed-in value. This should be ok since we can not have addresses bigger than
; 32 bits there anyhow.
;
; Also note that you probably will not be happy if either the sink or source spans across the
; boundary between RAM and I/O space. There is a good chance of hanging the machine,
; and this code will not check, so be careful.
;
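;
; In C terms the interface is roughly (a sketch, not a declaration taken from
; this file):
;
;     void bcopy_phys(unsigned long long from, unsigned long long to, unsigned int nbytes);
;
; Under the 32-bit ABI each long long arrives as a (high, low) register pair
; (r3/r4 for "from", r5/r6 for "to"), which the code below merges back into
; single registers before copying.
;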

    .align 5
    .globl EXT(bcopy_phys)

LEXT(bcopy_phys)
    crclr flipcache ; (HACK) No cache flip needed
    rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg
    mfsprg r8,2 ; get processor feature flags
    rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits
    rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg
    mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
    rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits
    mr r5,r7 ; Get the length into the right register

bcopy_phys1: ; enter from bcopy_physvir with pf64Bit already in cr6
    mfmsr r9 ; Get the MSR
    crclr noncache ; Set cached
    bt++ pf64Bitb,bcopy_phys64 ; skip if 64-bit (only they take hint)

; 32-bit CPUs

    sub. r0,r3,r4 ; to==from?
    rlwinm r8,r9,0,MSR_DR_BIT,MSR_DR_BIT ; was translation on?
    cmpwi cr1,r8,0 ; set cr1 beq if translation was off
    oris r8,r8,hi16(MASK(MSR_VEC)) ; Get vector enable
    cmplwi cr7,r5,0 ; Check if we have a 0 length
    beqlr- ; bail if to==from
    ori r8,r8,lo16(MASK(MSR_FP)) ; Get FP
    mr r6,r3 ; Set source
    andc r9,r9,r8 ; Turn off translation if it is on (should be) and FP, VEC
    beqlr- cr7 ; Bail if length is 0

    crclr restorex ; Make sure we do not trash BATs on the way out
    mtmsr r9 ; Set DR translation off
    isync ; Wait for it

    crnot fixxlate,cr1_eq ; Remember to turn on translation if it was
    b copyit32 ; Go copy it...

; 64-bit: turn DR off and SF on, remember if we need to restore on way out.

bcopy_phys64: ; r9 = MSR

    srdi r2,r3,31 ; (HACK) Get a 1 if source is in I/O memory
    srdi. r0,r9,63-MSR_SF_BIT ; set cr0 beq on if SF was off when we were called
    rlwinm r8,r9,MSR_DR_BIT+1,31,31 ; r8 <- DR bit right justified
    cmpld cr1,r3,r4 ; to==from?
    li r0,1 ; Note - we use this in a couple places below
    lis r6,hi16(MASK(MSR_VEC)) ; Get vector enable
    cmpwi cr7,r5,0 ; length==0 ?
    ori r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR)) ; Add in FP and DR
    beqlr-- cr1 ; bail if to==from
    srdi r10,r4,31 ; (HACK) Get a 1 if sink is in I/O memory
    rldimi r9,r0,63,MSR_SF_BIT ; set SF on
    beqlr-- cr7 ; bail if length==0
    andc r9,r9,r6 ; turn DR, VEC, FP off
    cmpwi cr1,r8,0 ; was DR on?
    crmove restorex,cr0_eq ; if SF was off, remember to turn back off before we return
    mtmsrd r9 ; turn 64-bit addressing on, data translation off
    cmpldi cr0,r2,1 ; (HACK) Is source in I/O memory?
    isync ; wait for it to happen
    mr r6,r3 ; Set source
    cmpldi cr7,r10,1 ; (HACK) Is sink in I/O memory?
    crnot fixxlate,cr1_eq ; if DR was on, remember to turn back on before we return

    cror flipcache,cr0_eq,cr7_eq ; (HACK) See if either source or sink is in I/O area

    rlwinm r10,r9,MSR_EE_BIT+1,31,31 ; (HACK GLORIOUS HACK) Isolate the EE bit
    sldi r11,r0,31-MSR_EE_BIT ; (HACK GLORIOUS HACK) Get a mask for the EE bit
    sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
    bf++ flipcache,copyit64 ; (HACK) No need to mess with caching...

;
; HACK GLORIOUS HACK - when we force off caching, we need to also force off
; interruptions. We are out of CR bits, so we need to stash the entry EE
; somewhere. It is in the XER.... We NEED to change this!!!!
;

    mtxer r10 ; (HACK GLORIOUS HACK) Remember EE
    andc r9,r9,r11 ; (HACK GLORIOUS HACK) Turn off EE bit
    mfspr r2,hid4 ; (HACK) Get HID4
    crset noncache ; (HACK) Set non-cached
    mtmsrd r9 ; (HACK GLORIOUS HACK) Force off EE
    or r2,r2,r0 ; (HACK) Set bit to make real accesses cache-inhibited
    sync ; (HACK) Sync up
    li r0,1
    mtspr hid4,r2 ; (HACK) Make real accesses cache-inhibited
    isync ; (HACK) Toss prefetches

    lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
    srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
    slbie r12 ; (HACK) Make sure the ERAT is cleared

    sync ; (HACK)
    isync ; (HACK)

    b copyit64


;
; void bcopy(from, to, nbytes)
;

    .align 5
    .globl EXT(bcopy)

LEXT(bcopy)

    crclr noncache ; Set cached

bcpswap:
    crclr flipcache ; (HACK) No cache flip needed
    mfsprg r8,2 ; get processor feature flags
    sub. r0,r4,r3 ; test for to==from in mode-independent way
    mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
    cmpwi cr1,r5,0 ; Check if we have a 0 length
    crclr restorex ; Make sure we do not trash BATs on the way out
    mr r6,r3 ; Set source
    crclr fixxlate ; Set translation already ok
    beqlr- ; Bail if "to" and "from" are the same
    beqlr- cr1 ; Bail if length is 0
    bt++ pf64Bitb,copyit64 ; handle 64-bit processor
    b copyit32 ; Go copy it...

;
; When we move the memory, forward overlaps must be handled. We
; also can not use the cache instructions if we came from bcopy_nc.
; We need to preserve R3 because it needs to be returned for memcpy.
; We can be interrupted and lose control here.
;
; There is no stack, so in order to use vectors, we would
; need to take the vector exception. Any potential gains by using vectors
; would be more than eaten up by this.
;
; NOTE: this code is called in three "modes":
; - on 32-bit processors (32-byte cache line)
; - on 64-bit processors running in 32-bit mode (128-byte cache line)
; - on 64-bit processors running in 64-bit mode (128-byte cache line)
;
; ALSO NOTE: bcopy is called from copyin and copyout etc
; with the "thread_recover" ptr set. This means bcopy must not set up a
; stack frame or touch non-volatile registers, and also means that it
; cannot rely on turning off interrupts, because we expect to get DSIs
; and have execution aborted by a "longjmp" to the thread_recover
; routine.
;
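;
; The forward/reverse decision used by both copyit32 and copyit64 boils down
; to a single unsigned compare; roughly, in C (an illustrative sketch, not
; code from this file):
;
;     if ((unsigned long)(dest - source) < length)
;         copy_backwards();   /* forward overlap: dest starts inside the source */
;     else
;         copy_forwards();
;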

    .align 5
    .globl EXT(memcpy)
; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
; processors...
LEXT(memcpy)
    crclr flipcache ; (HACK) No cache flip needed
    mfsprg r8,2 ; get processor feature flags
    cmplw cr1,r3,r4 ; "to" and "from" the same?
    mtcrf 0x02,r8 ; move pf64Bit to cr6 so we can test
    mr r6,r4 ; Set the "from"
    mr. r5,r5 ; Length zero?
    crclr noncache ; Set cached
    mr r4,r3 ; Set the "to"
    crclr fixxlate ; Set translation already ok
    beqlr- cr1 ; "to" and "from" are the same
    beqlr- ; Length is 0
    crclr restorex ; Make sure we do not trash BATs on the way out
    bt++ pf64Bitb,copyit64 ; handle 64-bit processors

copyit32: sub r12,r4,r6 ; Get potential overlap (negative if backward move)
    lis r8,0x7FFF ; Start up a mask
    srawi r11,r12,31 ; Propagate the sign bit
    dcbt br0,r6 ; Touch in the first source line
    cntlzw r7,r5 ; Get the highest power of 2 factor of the length
    ori r8,r8,0xFFFF ; Make limit 0x7FFFFFFF
    xor r9,r12,r11 ; If sink - source was negative, invert bits
    srw r8,r8,r7 ; Get move length limitation
    sub r9,r9,r11 ; If sink - source was negative, add 1 and get absolute value
    cmplw r12,r5 ; See if we actually forward overlap
    cmplwi cr7,r9,32 ; See if at least a line between source and sink
    dcbtst br0,r4 ; Touch in the first sink line
    cmplwi cr1,r5,32 ; Are we moving more than a line?
    cror noncache,noncache,cr7_lt ; Set to not DCBZ output line if not enough space
    blt- fwdovrlap ; This is a forward overlapping area, handle it...

;
; R4 = sink
; R5 = length
; R6 = source
;

;
; Here we figure out how much we have to move to get the sink onto a
; cache boundary. If we can, and there are still more than 32 bytes
; left to move, we can really speed things up by DCBZing the sink line.
; We can not do this if noncache is set because we will take an
; alignment exception.

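; The size of the aligning move is the distance to the next 32-byte boundary;
; in C terms (a sketch only):
;
;     lead = (unsigned int)(0 - (unsigned long)sink) & 31;   /* 0..31 bytes */
;
; and it is then capped by the mask built above so it can never exceed the
; remaining length.
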
G4word: ; enter from 64-bit case with word aligned uncached operands
    neg r0,r4 ; Get the number of bytes to move to align to a line boundary
    rlwinm. r0,r0,0,27,31 ; Clean it up and test it
    and r0,r0,r8 ; limit to the maximum front end move
    mtcrf 3,r0 ; Make branch mask for partial moves
    sub r5,r5,r0 ; Set the length left to move
    beq alline ; Already on a line...

    bf 31,alhalf ; No single byte to do...
    lbz r7,0(r6) ; Get the byte
    addi r6,r6,1 ; Point to the next
    stb r7,0(r4) ; Save the single
    addi r4,r4,1 ; Bump sink

; Sink is halfword aligned here

alhalf: bf 30,alword ; No halfword to do...
    lhz r7,0(r6) ; Get the halfword
    addi r6,r6,2 ; Point to the next
    sth r7,0(r4) ; Save the halfword
    addi r4,r4,2 ; Bump sink

; Sink is word aligned here

alword: bf 29,aldouble ; No word to do...
    lwz r7,0(r6) ; Get the word
    addi r6,r6,4 ; Point to the next
    stw r7,0(r4) ; Save the word
    addi r4,r4,4 ; Bump sink

; Sink is double aligned here

aldouble: bf 28,alquad ; No double to do...
    lwz r7,0(r6) ; Get the first word
    lwz r8,4(r6) ; Get the second word
    addi r6,r6,8 ; Point to the next
    stw r7,0(r4) ; Save the first word
    stw r8,4(r4) ; Save the second word
    addi r4,r4,8 ; Bump sink

; Sink is quadword aligned here

alquad: bf 27,alline ; No quad to do...
    lwz r7,0(r6) ; Get the first word
    lwz r8,4(r6) ; Get the second word
    lwz r9,8(r6) ; Get the third word
    stw r7,0(r4) ; Save the first word
    lwz r11,12(r6) ; Get the fourth word
    addi r6,r6,16 ; Point to the next
    stw r8,4(r4) ; Save the second word
    stw r9,8(r4) ; Save the third word
    stw r11,12(r4) ; Save the fourth word
    addi r4,r4,16 ; Bump sink

; Sink is line aligned here

alline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
    mtcrf 3,r5 ; Make branch mask for backend partial moves
    rlwinm r11,r5,0,0,26 ; Get number of bytes we are going to move
    beq- backend ; No full lines to move

    sub r5,r5,r11 ; Calculate the residual
    li r10,96 ; Stride for touch ahead

nxtline: subic. r0,r0,1 ; Account for the line now

    bt- noncache,skipz ; Skip if we are not cached...
    dcbz br0,r4 ; Blow away the whole line because we are replacing it
    dcbt r6,r10 ; Touch ahead a bit

skipz: lwz r7,0(r6) ; Get the first word
    lwz r8,4(r6) ; Get the second word
    lwz r9,8(r6) ; Get the third word
    stw r7,0(r4) ; Save the first word
    lwz r11,12(r6) ; Get the fourth word
    stw r8,4(r4) ; Save the second word
    lwz r7,16(r6) ; Get the fifth word
    stw r9,8(r4) ; Save the third word
    lwz r8,20(r6) ; Get the sixth word
    stw r11,12(r4) ; Save the fourth word
    lwz r9,24(r6) ; Get the seventh word
    stw r7,16(r4) ; Save the fifth word
    lwz r11,28(r6) ; Get the eighth word
    addi r6,r6,32 ; Point to the next
    stw r8,20(r4) ; Save the sixth word
    stw r9,24(r4) ; Save the seventh word
    stw r11,28(r4) ; Save the eighth word
    addi r4,r4,32 ; Bump sink
    bgt+ nxtline ; Do the next line, if any...


; Move backend quadword

backend: bf 27,noquad ; No quad to do...
    lwz r7,0(r6) ; Get the first word
    lwz r8,4(r6) ; Get the second word
    lwz r9,8(r6) ; Get the third word
    lwz r11,12(r6) ; Get the fourth word
    stw r7,0(r4) ; Save the first word
    addi r6,r6,16 ; Point to the next
    stw r8,4(r4) ; Save the second word
    stw r9,8(r4) ; Save the third word
    stw r11,12(r4) ; Save the fourth word
    addi r4,r4,16 ; Bump sink

; Move backend double

noquad: bf 28,nodouble ; No double to do...
    lwz r7,0(r6) ; Get the first word
    lwz r8,4(r6) ; Get the second word
    addi r6,r6,8 ; Point to the next
    stw r7,0(r4) ; Save the first word
    stw r8,4(r4) ; Save the second word
    addi r4,r4,8 ; Bump sink

; Move backend word

nodouble: bf 29,noword ; No word to do...
    lwz r7,0(r6) ; Get the word
    addi r6,r6,4 ; Point to the next
    stw r7,0(r4) ; Save the word
    addi r4,r4,4 ; Bump sink

; Move backend halfword

noword: bf 30,nohalf ; No halfword to do...
    lhz r7,0(r6) ; Get the halfword
    addi r6,r6,2 ; Point to the next
    sth r7,0(r4) ; Save the halfword
    addi r4,r4,2 ; Bump sink

; Move backend byte

nohalf: bf 31,bcpydone ; Leave cuz we are all done...
    lbz r7,0(r6) ; Get the byte
    stb r7,0(r4) ; Save the single

bcpydone:
    mfmsr r9 ; Get the MSR
    bf++ flipcache,bcpydone0 ; (HACK) No need to mess with caching...

    li r0,1 ; (HACK) Get a 1
    mfxer r10 ; (HACK GLORIOUS HACK) Get the entry EE
    sldi r0,r0,32+8 ; (HACK) Get the right bit to turn off caching
    mfspr r2,hid4 ; (HACK) Get HID4
    rlwinm r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT ; (HACK GLORIOUS HACK) Set the EE bit
    andc r2,r2,r0 ; (HACK) Clear bit to make real accesses cache-inhibited
    or r9,r9,r10 ; (HACK GLORIOUS HACK) Set the EE in MSR
    sync ; (HACK) Sync up
    mtspr hid4,r2 ; (HACK) Make real accesses not cache-inhibited
    isync ; (HACK) Toss prefetches

    lis r12,0xE000 ; (HACK) Get the unlikeliest ESID possible
    srdi r12,r12,1 ; (HACK) Make 0x7FFFFFFFF0000000
    slbie r12 ; (HACK) Make sure the ERAT is cleared

    mtmsr r9 ; (HACK GLORIOUS HACK) Set EE properly

bcpydone0:
    lis r0,hi16(MASK(MSR_VEC)) ; Get the vector bit
    ori r0,r0,lo16(MASK(MSR_FP)) ; Get the float bit
    bf++ fixxlate,bcpydone1 ; skip if we do not need to fix translation...
    ori r9,r9,lo16(MASK(MSR_DR)) ; Turn data translation on
    andc r9,r9,r0 ; Make sure that FP and VEC are off
    mtmsr r9 ; Just do it
    isync ; Hang in there

bcpydone1:
    bflr++ restorex ; done if we do not have to fix up addressing
    mfsprg r8,2 ; get the feature flags again
    mtcrf 0x02,r8 ; put pf64Bit where we can test it
    bt++ pf64Bitb,bcpydone2 ; skip if 64-bit processor

; 32-bit processor, so clear out the BATs we set up for bcopy_physvir

    li r0,0 ; Get set to invalidate upper half
    sync ; Make sure all is well
    mtdbatu 0,r0 ; Clear sink upper DBAT
    mtdbatu 1,r0 ; Clear source upper DBAT
    sync
    isync
    blr

; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys

bcpydone2:
    mfmsr r9 ; get MSR again
    andc r9,r9,r0 ; Make sure that FP and VEC are off
    rldicl r9,r9,0,MSR_SF_BIT+1 ; clear SF
    mtmsrd r9
    isync
    blr


;
; 0123456789ABCDEF0123456789ABCDEF
; 0123456789ABCDEF0123456789ABCDEF
; F
; DE
; 9ABC
; 12345678
; 123456789ABCDEF0
; 0

;
; Here is where we handle a forward overlapping move. These will be slow
; because we can not kill the cache of the destination until after we have
; loaded/saved the source area. Also, reading memory backwards is slower
; when the cache line needs to be loaded, because the critical doubleword
; is loaded first - here the last one in the line - after which the fill
; wraps back to the start of the line and proceeds in order. That means that
; when we are at the second to last DW we have to wait until the whole line
; is in cache before we can proceed.
;
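;
; In outline, the reverse path below is (an illustrative C sketch; the names
; are ours, not from this file):
;
;     char *s = source + length;     /* one past the last source byte */
;     char *d = sink + length;       /* one past the last sink byte   */
;     while (length--) *--d = *--s;  /* done in line/quad/word/... sized pieces below */
;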

G4reverseWord: ; here from 64-bit code with word aligned uncached operands
fwdovrlap: add r4,r5,r4 ; Point past the last sink byte
    add r6,r5,r6 ; Point past the last source byte
    and r0,r4,r8 ; Apply movement limit
    li r12,-1 ; Make sure we touch in the actual line
    mtcrf 3,r0 ; Figure out the best way to move backwards
    dcbt r12,r6 ; Touch in the last line of source
    rlwinm. r0,r0,0,27,31 ; Calculate the length to adjust to cache boundary
    dcbtst r12,r4 ; Touch in the last line of the sink
    beq- balline ; Already on cache line boundary

    sub r5,r5,r0 ; Precalculate move length left after alignment

    bf 31,balhalf ; No single byte to do...
    lbz r7,-1(r6) ; Get the byte
    subi r6,r6,1 ; Point to the next
    stb r7,-1(r4) ; Save the single
    subi r4,r4,1 ; Bump sink

; Sink is halfword aligned here

balhalf: bf 30,balword ; No halfword to do...
    lhz r7,-2(r6) ; Get the halfword
    subi r6,r6,2 ; Point to the next
    sth r7,-2(r4) ; Save the halfword
    subi r4,r4,2 ; Bump sink

; Sink is word aligned here

balword: bf 29,baldouble ; No word to do...
    lwz r7,-4(r6) ; Get the word
    subi r6,r6,4 ; Point to the next
    stw r7,-4(r4) ; Save the word
    subi r4,r4,4 ; Bump sink

; Sink is double aligned here

baldouble: bf 28,balquad ; No double to do...
    lwz r7,-8(r6) ; Get the first word
    lwz r8,-4(r6) ; Get the second word
    subi r6,r6,8 ; Point to the next
    stw r7,-8(r4) ; Save the first word
    stw r8,-4(r4) ; Save the second word
    subi r4,r4,8 ; Bump sink

; Sink is quadword aligned here

balquad: bf 27,balline ; No quad to do...
    lwz r7,-16(r6) ; Get the first word
    lwz r8,-12(r6) ; Get the second word
    lwz r9,-8(r6) ; Get the third word
    lwz r11,-4(r6) ; Get the fourth word
    stw r7,-16(r4) ; Save the first word
    subi r6,r6,16 ; Point to the next
    stw r8,-12(r4) ; Save the second word
    stw r9,-8(r4) ; Save the third word
    stw r11,-4(r4) ; Save the fourth word
    subi r4,r4,16 ; Bump sink

; Sink is line aligned here

balline: rlwinm. r0,r5,27,5,31 ; Get the number of full lines to move
    mtcrf 3,r5 ; Make branch mask for backend partial moves
    beq- bbackend ; No full lines to move


; Registers in use: R0, R1, R3, R4, R5, R6
; Registers not in use: R2, R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them

bnxtline: subic. r0,r0,1 ; Account for the line now

    lwz r7,-32(r6) ; Get the first word
    lwz r5,-28(r6) ; Get the second word
    lwz r2,-24(r6) ; Get the third word
    lwz r12,-20(r6) ; Get the fourth word
    lwz r11,-16(r6) ; Get the fifth word
    lwz r10,-12(r6) ; Get the sixth word
    lwz r9,-8(r6) ; Get the seventh word
    lwz r8,-4(r6) ; Get the eighth word
    subi r6,r6,32 ; Point to the next

    stw r7,-32(r4) ; Save the first word
    ble- bnotouch ; Last time, skip touch of source...
    dcbt br0,r6 ; Touch in next source line

bnotouch: stw r5,-28(r4) ; Save the second word
    stw r2,-24(r4) ; Save the third word
    stw r12,-20(r4) ; Save the fourth word
    stw r11,-16(r4) ; Save the fifth word
    stw r10,-12(r4) ; Save the sixth word
    stw r9,-8(r4) ; Save the seventh word
    stw r8,-4(r4) ; Save the eighth word
    subi r4,r4,32 ; Bump sink

    bgt+ bnxtline ; Do the next line, if any...

;
; Note: We touched these lines in at the beginning
;

; Move backend quadword

bbackend: bf 27,bnoquad ; No quad to do...
    lwz r7,-16(r6) ; Get the first word
    lwz r8,-12(r6) ; Get the second word
    lwz r9,-8(r6) ; Get the third word
    lwz r11,-4(r6) ; Get the fourth word
    stw r7,-16(r4) ; Save the first word
    subi r6,r6,16 ; Point to the next
    stw r8,-12(r4) ; Save the second word
    stw r9,-8(r4) ; Save the third word
    stw r11,-4(r4) ; Save the fourth word
    subi r4,r4,16 ; Bump sink

; Move backend double

bnoquad: bf 28,bnodouble ; No double to do...
    lwz r7,-8(r6) ; Get the first word
    lwz r8,-4(r6) ; Get the second word
    subi r6,r6,8 ; Point to the next
    stw r7,-8(r4) ; Save the first word
    stw r8,-4(r4) ; Save the second word
    subi r4,r4,8 ; Bump sink

; Move backend word

bnodouble: bf 29,bnoword ; No word to do...
    lwz r7,-4(r6) ; Get the word
    subi r6,r6,4 ; Point to the next
    stw r7,-4(r4) ; Save the word
    subi r4,r4,4 ; Bump sink

; Move backend halfword

bnoword: bf 30,bnohalf ; No halfword to do...
    lhz r7,-2(r6) ; Get the halfword
    subi r6,r6,2 ; Point to the next
    sth r7,-2(r4) ; Save the halfword
    subi r4,r4,2 ; Bump sink

; Move backend byte

bnohalf: bflr 31 ; Leave cuz we are all done...
    lbz r7,-1(r6) ; Get the byte
    stb r7,-1(r4) ; Save the single

    b bcpydone ; Go exit cuz we are all done...


// Here on 64-bit processors, which have a 128-byte cache line. This can be
// called either in 32 or 64-bit mode, which makes the test for reverse moves
// a little tricky. We've already filtered out the (source==dest) and (len==0)
// special cases.
//
// When entered:
// r4 = destination (32 or 64-bit ptr)
// r5 = length (always 32 bits)
// r6 = source (32 or 64-bit ptr)
// cr5 = noncache, fixxlate, flipcache, and restorex flags set

    .align 5
copyit64:
    lis r2,0x4000 // r2 = 0x00000000 40000000
    neg r12,r4 // start to compute #bytes to align dest
    bt-- noncache,noncache1 // (HACK) Do not even try anything cached...
    dcbt 0,r6 // touch in 1st block of source
noncache1:
    add. r2,r2,r2 // if 0x00000000 80000000 < 0, we are in 32-bit mode
    cntlzw r9,r5 // get highest power-of-2 in length
    rlwinm r7,r12,0,25,31 // r7 <- bytes to 128-byte align dest
    bt-- noncache,noncache2 // (HACK) Do not even try anything cached...
    dcbtst 0,r4 // touch in 1st destination cache block
noncache2:
    sraw r2,r2,r9 // get mask with 1s for leading 0s in length, plus 1 more 1-bit
    bge copyit64a // skip if we are running in 64-bit mode
    rlwinm r4,r4,0,0,31 // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
    rlwinm r5,r5,0,0,31
    rlwinm r6,r6,0,0,31
copyit64a: // now we can use 64-bit compares even if running in 32-bit mode
    sub r8,r4,r6 // get (dest-source)
    andc r7,r7,r2 // limit bytes to align by operand length
    cmpld cr1,r8,r5 // if (dest-source)<length, must move reverse
    bt-- noncache,c64uncached // skip if uncached
    blt-- cr1,c64rdouble // handle cached reverse moves


// Forward, cached or doubleword aligned uncached. This is the common case.
// r4-r6 = dest, length, source (as above)
// r7 = #bytes to 128-byte align dest (limited by copy length)
// cr5 = flags, as above

c64double:
    andi. r8,r7,7 // r8 <- #bytes to doubleword align
    srwi r9,r7,3 // r9 <- #doublewords to 128-byte align
    sub r5,r5,r7 // adjust length remaining
    cmpwi cr1,r9,0 // any doublewords to move to cache align?
    srwi r10,r5,7 // r10 <- 128-byte chunks to xfer after aligning dest
    cmpwi cr7,r10,0 // set cr7 on chunk count
    beq c64double2 // dest already doubleword aligned
    mtctr r8
    b c64double1

    .align 5 // align inner loops
c64double1: // copy bytes until dest is doubleword aligned
    lbz r0,0(r6)
    addi r6,r6,1
    stb r0,0(r4)
    addi r4,r4,1
    bdnz c64double1

c64double2: // r9/cr1=doublewords, r10=128-byte chunks, cr7=blt if r5==0
    beq cr1,c64double4 // no doublewords to xfer in order to cache align
    mtctr r9
    b c64double3

    .align 5 // align inner loops
c64double3: // copy doublewords until dest is 128-byte aligned
    ld r7,0(r6)
    addi r6,r6,8
    std r7,0(r4)
    addi r4,r4,8
    bdnz c64double3

// Here to xfer 128-byte chunks, if any. Because the IBM 970 cannot issue two stores/cycle,
// we pipeline the inner loop so we can pair loads and stores. Since we only have 8 GPRs for
// data (64 bytes), we load/store each twice per 128-byte chunk.
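//
// Note on the structure below: the first iteration enters at
// c64InnerLoopEntryPt with the first half-line already loaded; each pass of
// the loop then pairs the stores of one half-line with the loads of the next,
// and the second half-line of the final chunk is stored after the bdnz falls
// through.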

c64double4: // r10/cr7=128-byte chunks
    rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords, after moving chunks
    cmpwi cr1,r0,0 // set cr1 on leftover doublewords
    beq cr7,c64double7 // no 128-byte chunks
    sub r8,r6,r4 // r8 <- (source - dest)
    li r9,128 // start at next cache line (we've already touched in 1st line)
    cmpldi cr7,r8,128 // if (source-dest)<128, cannot use dcbz128 because of overlap
    cror noncache,cr7_lt,noncache // turn on "noncache" flag if (source-dest)<128
    bt-- noncache,noncache3 // (HACK) Skip cache touch if noncachable
    dcbt128 r9,r6,1 // start forward stream
noncache3:
    mtctr r10

    ld r0,0(r6) // start pipe: load 1st half-line
    ld r2,8(r6)
    ld r7,16(r6)
    ld r8,24(r6)
    ld r9,32(r6)
    ld r10,40(r6)
    ld r11,48(r6)
    ld r12,56(r6)
    b c64InnerLoopEntryPt

    .align 5 // align inner loop
c64InnerLoop: // loop copying 128-byte cache lines to 128-aligned destination
    std r0,64(r4) // store 2nd half of chunk n
    ld r0,0(r6) // load 1st half of chunk n+1
    std r2,72(r4)
    ld r2,8(r6)
    std r7,80(r4)
    ld r7,16(r6)
    std r8,88(r4)
    ld r8,24(r6)
    std r9,96(r4)
    ld r9,32(r6)
    std r10,104(r4)
    ld r10,40(r6)
    std r11,112(r4)
    ld r11,48(r6)
    std r12,120(r4)
    ld r12,56(r6)
    addi r4,r4,128 // advance to next dest chunk
c64InnerLoopEntryPt: // initial entry into loop, with 1st halfline loaded
    bt noncache,c64InnerLoop1 // skip if uncached or overlap
    dcbz128 0,r4 // avoid prefetch of next cache line
c64InnerLoop1:
    std r0,0(r4) // store 1st half of chunk n
    ld r0,64(r6) // load 2nd half of chunk n
    std r2,8(r4)
    ld r2,72(r6)
    std r7,16(r4)
    ld r7,80(r6)
    std r8,24(r4)
    ld r8,88(r6)
    std r9,32(r4)
    ld r9,96(r6)
    std r10,40(r4)
    ld r10,104(r6)
    std r11,48(r4)
    ld r11,112(r6)
    std r12,56(r4)
    ld r12,120(r6)
    addi r6,r6,128 // advance to next source chunk if any
    bdnz c64InnerLoop // loop if more chunks

    std r0,64(r4) // store 2nd half of last chunk
    std r2,72(r4)
    std r7,80(r4)
    std r8,88(r4)
    std r9,96(r4)
    std r10,104(r4)
    std r11,112(r4)
    std r12,120(r4)
    addi r4,r4,128 // advance to next dest chunk

c64double7: // r5 <- leftover bytes, cr1 set on doubleword count
    rlwinm r0,r5,29,28,31 // r0 <- count of leftover doublewords (0-15)
    andi. r5,r5,7 // r5/cr0 <- count of leftover bytes (0-7)
    beq cr1,c64byte // no leftover doublewords
    mtctr r0
    b c64double8

    .align 5 // align inner loop
c64double8: // loop copying leftover doublewords
    ld r0,0(r6)
    addi r6,r6,8
    std r0,0(r4)
    addi r4,r4,8
    bdnz c64double8


// Forward byte loop.

c64byte: // r5/cr0 <- byte count (can be big if unaligned uncached)
    beq bcpydone // done if no leftover bytes
    mtctr r5
    b c64byte1

    .align 5 // align inner loop
c64byte1:
    lbz r0,0(r6)
    addi r6,r6,1
    stb r0,0(r4)
    addi r4,r4,1
    bdnz c64byte1

    b bcpydone


// Uncached copies. We must avoid unaligned accesses, since they always take alignment
// exceptions on uncached memory on 64-bit processors. This may mean we copy long operands
// a byte at a time, but that is still much faster than alignment exceptions.
// r4-r6 = dest, length, source (as above)
// r2 = mask of 1s for leading 0s in length, plus 1 extra 1
// r7 = #bytes to copy to 128-byte align dest (limited by operand length)
// cr1 = blt if reverse move required
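//
// Dispatch is on the relative alignment of the two operands; roughly, in C
// (an illustrative sketch only):
//
//     switch ((source ^ dest) & 7) {
//     case 0:  /* relatively doubleword aligned: use ld/std below      */ break;
//     case 4:  /* relatively word aligned: use the G3/G4 lwz/stw code  */ break;
//     default: /* no useful relative alignment: copy a byte at a time  */ break;
//     }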

c64uncached:
    xor r0,r6,r4 // get relative alignment
    rlwinm r10,r0,0,29,31 // relatively doubleword aligned?
    rlwinm r11,r0,0,30,31 // relatively word aligned?
    not r8,r2 // get mask to limit initial length of copy for G4word
    blt cr1,c64reverseUncached

    cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
    cmpwi cr1,r11,0 // set cr1 beq if word aligned
    beq cr0,c64double // doubleword aligned
    beq cr1,G4word // word aligned, use G3/G4 code
    cmpwi r5,0 // set cr0 on byte count
    b c64byte // unaligned operands

c64reverseUncached:
    cmpwi cr0,r10,0 // set cr0 beq if doubleword aligned
    cmpwi cr1,r11,0 // set cr1 beq if word aligned
    beq cr0,c64rdouble // doubleword aligned so can use LD/STD
    beq cr1,G4reverseWord // word aligned, use G3/G4 code
    add r6,r6,r5 // point to (end+1) of source and dest
    add r4,r4,r5
    cmpwi r5,0 // set cr0 on length
    b c64rbyte // copy a byte at a time



// Reverse doubleword copies. This is used for all cached copies, and doubleword
// aligned uncached copies.
// r4 = destination (32 or 64-bit ptr)
// r5 = length (always 32 bits)
// r6 = source (32 or 64-bit ptr)
// cr5 = noncache, fixxlate, and restorex flags set

c64rdouble:
    add r6,r6,r5 // point to (end+1) of source and dest
    add r4,r4,r5
    rlwinm. r7,r4,0,29,31 // r7 <- #bytes to doubleword align dest
    cmplw cr1,r7,r5 // operand long enough to doubleword align?
    blt cr1,c64rd0 // yes
    mr r7,r5 // no
c64rd0:
    sub r5,r5,r7 // adjust length
    srwi r8,r5,6 // r8 <- 64-byte chunks to xfer
    cmpwi cr1,r8,0 // any chunks?
    beq c64rd2 // dest already doubleword aligned
    mtctr r7

c64rd1: // copy bytes until dest is doubleword aligned
    lbzu r0,-1(r6)
    stbu r0,-1(r4)
    bdnz c64rd1

c64rd2: // r8/cr1 <- count of 64-byte chunks
    rlwinm r0,r5,29,29,31 // r0 <- count of leftover doublewords
    andi. r5,r5,7 // r5/cr0 <- count of leftover bytes
    cmpwi cr7,r0,0 // leftover doublewords?
    beq cr1,c64rd4 // no chunks to xfer
    li r9,-128 // start at next cache line
    mtctr r8
    bt noncache,c64rd3 // (HACK) Do not start a stream if noncachable...
    dcbt128 r9,r6,3 // start reverse stream
    b c64rd3

    .align 5 // align inner loop
c64rd3: // loop copying 64-byte chunks
    ld r7,-8(r6)
    ld r8,-16(r6)
    ld r9,-24(r6)
    ld r10,-32(r6)
    ld r11,-40(r6)
    ld r12,-48(r6)
    std r7,-8(r4)
    std r8,-16(r4)
    ld r7,-56(r6)
    ldu r8,-64(r6)
    std r9,-24(r4)
    std r10,-32(r4)
    std r11,-40(r4)
    std r12,-48(r4)
    std r7,-56(r4)
    stdu r8,-64(r4)
    bdnz c64rd3

c64rd4: // r0/cr7 = leftover doublewords r5/cr0 = leftover bytes
    beq cr7,c64rbyte // no leftover doublewords
    mtctr r0

c64rd5: // loop copying leftover doublewords
    ldu r0,-8(r6)
    stdu r0,-8(r4)
    bdnz c64rd5


// Reverse byte loop.

c64rbyte: // r5/cr0 <- byte count (can be big if unaligned uncached)
    beq bcpydone // done if no leftover bytes
    mtctr r5

c64rbyte1:
    lbzu r0,-1(r6)
    stbu r0,-1(r4)
    bdnz c64rbyte1

    b bcpydone
