 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
#define cr1_gt  5                       // bit 1 of cr1
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
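 *
 * For illustration only, a hypothetical C caller might look like the sketch
 * below. The wrapper name, the rendering of "boolean" as int, and the final
 * one's-complement inversion are assumptions of the sketch, not part of this
 * file:
 *
 *      extern short xsum_assym(short *p, int len, short xsum, int odd);
 *
 *      unsigned short ip_cksum_example(short *buf, int len) {
 *          // even start, no accumulated sum carried in
 *          unsigned short sum = (unsigned short)xsum_assym(buf, len, 0, 0);
 *          return (unsigned short)~sum;    // IP headers store the complement
 *      }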
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set based on the low 32 bits of the sum).
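 *
 * As a rough C sketch of that idea (illustrative only; the function name is
 * invented), the whole algorithm reduces to:
 *
 *      static unsigned short cksum_sketch(const unsigned short *p, int nwords) {
 *          unsigned int sum = 0;               // 32-bit accumulator
 *          for (int i = 0; i < nwords; i++)
 *              sum += p[i];                    // carries pile up in the high half
 *          sum = (sum >> 16) + (sum & 0xFFFF); // fold carries back in
 *          sum += (sum >> 16);                 // the fold itself can carry once
 *          return (unsigned short)sum;
 *      }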
 *
 * Using AltiVec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical.
        .globl  _xsum_nop_if_32bit

        cmplwi  cr0,r4,kShort       ; too short to word align?
        rlwinm  r2,r3,0,0x3         ; get byte offset in word
        dcbt    0,r3                ; touch in 1st cache line
        cmpwi   cr6,r2,0            ; is address word aligned?
        ble     cr0,Lshort          ; skip if too short to bother aligning

        subfic  r0,r2,4             ; get #bytes in partial word
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0             ; turn off carry
        beq     cr6,Laligned        ; skip if already word aligned (r2==0 if aligned)
; Partial word at start: zero filled on left, it becomes the initial checksum.

        rlwinm  r3,r3,0,0,29        ; word align address
        mtcrf   0x01,r2             ; move byte offset to cr7
        lwz     r6,0(r3)            ; get partial word
        li      r7,-1               ; start of mask for partial fill
        slwi    r8,r2,3             ; multiply byte offset by 8
        sub     r4,r4,r0            ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt    ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8            ; get mask for bytes to keep in partial word
        addi    r3,r3,4             ; point to next word of input
        and     r2,r6,r7            ; zero fill on left
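; In C terms, the zero fill above is roughly the following sketch (names are
; invented). On big-endian PPC the bytes before the start of the data occupy
; the high-order lanes of the aligned word, so a right-shifted all-ones mask
; keeps only the wanted trailing bytes:
;
;       unsigned long addr = (unsigned long)p;                // byte address
;       unsigned int  off  = addr & 3;                        // byte offset in word
;       unsigned int  word = *(unsigned int *)(addr & ~3UL);  // aligned word
;       unsigned int  mask = 0xFFFFFFFFU >> (off * 8);        // keep trailing bytes
;       unsigned int  csum = word & mask;                     // junk bytes zeroed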
; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
;     r2 = initial checksum
;     r3 = word aligned address
;     r4 = length remaining
;     r5 = accumulated sum parameter
;     cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5             ; get count of 32-byte chunks
        mtcrf   0x02,r4             ; move residual length to cr6 and cr7
        beq     cr0,Lleftovers      ; no chunks

        mtctr   r0                  ; set up loop count
        li      r4,32               ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath          ; use the 64-bit path (patched to a nop on 32-bit machines)
        dcbt    r4,r3               ; touch in 2nd cache line
        li      r0,96               ; get touch offset
        b       LInnerLoop32        ; enter 32-bit loop

; Inner loop for 32-bit machines.
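; The 32-bit loop strings "lwz" loads together with a chain of "adde" adds.
; A hedged C model of one such "adde" chain, with invented names ("words",
; "nwords"), for illustration only:
;
;       unsigned int sum = 0, carry = 0;
;       for (int i = 0; i < nwords; i++) {
;           unsigned int prev = sum;
;           sum += words[i] + carry;            // adde: add plus carry-in
;           carry = (sum < prev) || (sum == prev && carry);  // carry-out
;       }
;       sum += carry;                           // final addze-style step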
; Handle leftover bytes.
;     r2 = checksum so far
;     r3 = word aligned address
;     r5 = accumulated sum parameter
;     cr1_gt = "starting on odd address" flag
;     cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8       ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane
; All data bytes checksummed. Wrap up.
;     r2 = checksum so far (word parallel)
;     r5 = accumulated sum parameter
;     cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2               ; add in last carry
        addze   r2,r2               ; in case the first "addze" carried

Lwrapupx:                           ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case the first add carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7

        bf      cr1_gt,Lswapped     ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF     ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00      ; bottom to top
        or      r2,r6,r7            ; rejoin the halves

Lswapped:

; Finally, add in the checksum passed in as a parameter.

        add     r2,r2,r5            ; add passed-in checksum
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case the first add carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7            ; steer result into r3
        blr
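; Hedged C equivalent of the wrap-up above (names invented for illustration):
;
;       unsigned int fold16(unsigned int s) {   // 32-bit sum -> 16-bit sum
;           s = (s >> 16) + (s & 0xFFFF);       // add the two halves
;           s = (s >> 16) + (s & 0xFFFF);       // again, in case that carried
;           return s;
;       }
;
;       sum = fold16(sum);
;       if (started_on_odd_address)             // undo the byte-lane rotation
;           sum = ((sum >> 8) & 0x00FF) | ((sum << 8) & 0xFF00);
;       return fold16(sum + xsum_param);        // merge caller's partial sum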
; Handle short operands. Do a halfword at a time.
;     r4 = length (<= kShort)
;     r5 = accumulated sum parameter
;     r6 = "starting on odd byte" flag

Lshort:
        cmpwi   cr6,r4,2            ; at least two bytes?
        andi.   r0,r4,1             ; odd length?
        li      r2,0                ; initialize checksum
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2         ; fewer than two bytes, so skip

Lshort1:
        cmpwi   cr6,r4,4            ; at least two more bytes (after we decrement)?
        lhz     r7,0(r3)            ; get next halfword
        addi    r3,r3,2             ; advance pointer
        subi    r4,r4,2             ; decrement length
        add     r2,r2,r7            ; note no need for "adde"
        bge     cr6,Lshort1         ; loop for 2 more bytes
Lshort2:                            ; handle odd byte at end, if any
        beq     Lwrapupx            ; no byte at end, so wrap up with carry undefined
        lbz     r7,0(r3)            ; get last byte
        slwi    r7,r7,8             ; shift last byte into proper lane
        add     r2,r2,r7            ; add it into the checksum
        b       Lwrapupx            ; wrap up with carry undefined
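; A hedged C sketch of this short path (illustrative only). Because the
; operand is at most kShort bytes, the 32-bit sum cannot overflow, so plain
; adds suffice and no "adde" is needed:
;
;       unsigned int sum = 0;
;       const unsigned char *b = (const unsigned char *)p;
;       while (len >= 2) {
;           sum += *(const unsigned short *)b;  // halfword at a time
;           b += 2; len -= 2;
;       }
;       if (len)                                // odd trailing byte
;           sum += (unsigned int)*b << 8;       // byte goes in the high lane
;       // then fall into the common fold/swap wrap-up (Lwrapupx)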
; Handle 64-bit machines. The major improvement over the 32-bit path is that we use
; four parallel accumulators in 64-bit registers: 32-bit words carry naturally into
; the upper half of each accumulator, so we do not have to use "adde", which
; serializes on the carry bit. Note that we cannot do 64-bit "adde"s, because we run
; in 32-bit mode, so carry would not be set correctly.
;     r2 = checksum so far (ie, the zero-filled partial first word)
;     r3 = word aligned address
;     r5 = accumulated sum parameter
;     ctr = number of 32-byte chunks of input
;     carry = unused in this code
;     cr1_gt = "starting on odd address" flag
;     cr6,cr7 = residual length
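; Hedged C sketch of the four-accumulator idea (illustrative only; "w" is the
; input viewed as 32-bit words, and leftovers are handled separately):
;
;       unsigned long long a0 = 0, a1 = 0, a2 = 0, a3 = 0;
;       for (int i = 0; i + 8 <= nwords; i += 8) {   // 32 bytes per pass
;           a0 += w[i+0]; a1 += w[i+1];              // four independent chains,
;           a2 += w[i+2]; a3 += w[i+3];              // so no serialization on
;           a0 += w[i+4]; a1 += w[i+5];              // a single carry bit
;           a2 += w[i+6]; a3 += w[i+7];
;       }
;       unsigned long long acc = a0 + a1 + a2 + a3;  // folded to 16 bits later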
L64BitPath:
        stw     r13,-4(r1)          ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128              ; to touch next line
        li      r13,0               ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0               ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)            ; start pipeline by loading first 32 bytes into r4, r6-r12
        mr      r14,r2              ; just copy incoming partial word into one of the accumulators
        li      r0,256              ; get touch offset
        lwz     r12,28(r3)          ; load last word of previous chunk
        addi    r3,r3,32            ; skip past the chunk
        bdnz++  LInnerLoop64        ; enter loop if another chunk to go

        b       LAddLastChunk       ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads run one iteration ahead of the adds), and unrolled.
; It should take 9-10 cycles per iteration, consuming 64 bytes of input.
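; The pipelining pattern, as a hedged C sketch (illustrative only; memcpy
; stands in for the eight "lwz" loads, needs <string.h>, and "p", "chunks",
; and the acc[4] accumulators are assumed set up as in the sketch above):
;
;       unsigned int cur[8], nxt[8];
;       memcpy(cur, p, 32); p += 32;                 // prime: load chunk 0
;       while (--chunks) {
;           memcpy(nxt, p, 32); p += 32;             // loads run a chunk ahead
;           for (int j = 0; j < 8; j++)              // adds for previous chunk
;               acc[j & 3] += cur[j];
;           memcpy(cur, nxt, 32);
;       }
;       for (int j = 0; j < 8; j++)                  // drain the pipeline
;           acc[j & 3] += cur[j];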
LInnerLoop64:                       ; 64 bytes/iteration
        add     r13,r13,r4          ; cycle 1
        dcbt    r3,r0               ; touch in 2 lines ahead
        add     r15,r15,r7          ; cycle 2, etc
        bdz--   LEarlyExit          ; early exit if no more chunks
        nop                         ; position last load in 2nd dispatch slot

; Add in the last 32-byte chunk, and any leftover bytes.
;     r3 = word aligned address of next byte of data
;     r5 = accumulated sum parameter
;     r13-r16 = the four accumulators
;     cr1_gt = "starting on odd address" flag
;     cr6,cr7 = residual length

LEarlyExit:                         ; here from middle of inner loop
        lwz     r12,28(r3)          ; load last word of last chunk

LAddLastChunk:                      ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4          ; add in last chunk
        add     r14,r14,r6          ; these are 64-bit adds

; Handle leftover bytes, if any.

        bf      27,Lleft1           ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
;     r3 = word aligned address of next byte of data
;     r5 = accumulated sum parameter
;     r13-r16 = the four accumulators
;     carry = not used so far
;     cr1_gt = "starting on odd address" flag

        add     r8,r13,r14          ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)          ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9            ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32            ; get upper half of 64-bit sum
        addc    r2,r7,r8            ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup             ; merge r2, r5, and carry into a 16-bit checksum
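; Hedged C equivalent of this final fold (illustrative only; "acc" is the
; 64-bit sum of the four accumulators, as in the sketch near L64BitPath):
;
;       unsigned int hi  = (unsigned int)(acc >> 32);   // srdi r7,r8,32
;       unsigned int lo  = (unsigned int)acc;
;       unsigned int sum = hi + lo;                     // addc (records carry)
;       unsigned int cy  = (sum < hi);                  // consumed by addze
;       // Lwrapup adds cy back in, folds to 16 bits, optionally byte-swaps,
;       // and merges the caller's partial sum.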