/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#define cr1_gt  5       // bit 1 of cr1
/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even.  Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers.  1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC.  Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits.  On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit.  On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end.  We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set based on the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it impractical.
 */
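/*
 * A minimal C sketch of the whole routine (illustrative only; the function
 * and variable names below are not from the kernel sources):
 *
 *	#include <stdint.h>
 *
 *	uint16_t cksum16(const uint8_t *p, long len, uint16_t xsum, int odd)
 *	{
 *		uint64_t sum = 0;
 *		while (len >= 2) {                  // 1s-complement sum of the data,
 *			sum += (p[0] << 8) | p[1];      // treated as 16-bit big-endian ints
 *			p += 2; len -= 2;
 *		}
 *		if (len)                            // odd trailing byte
 *			sum += (uint32_t)p[0] << 8;
 *		while (sum >> 16)                   // fold carries back into 16 bits
 *			sum = (sum & 0xFFFF) + (sum >> 16);
 *		if (odd)                            // started on an odd address:
 *			sum = ((sum >> 8) | (sum << 8)) & 0xFFFF;   // swap byte lanes
 *		sum += xsum;                        // add accumulated sum parameter
 *		while (sum >> 16)
 *			sum = (sum & 0xFFFF) + (sum >> 16);
 *		return (uint16_t)sum;
 *	}
 */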
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit

_xsum_assym:
        cmplwi  cr0,r4,kShort       ; too short to word align?
        rlwinm  r2,r3,0,0x3         ; get byte offset in word
        dcbt    0,r3                ; touch in 1st cache line
        cmpwi   cr6,r2,0            ; is address word aligned?
        ble     cr0,Lshort          ; skip if too short to bother aligning

        subfic  r0,r2,4             ; get #bytes in partial word
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0             ; turn off carry
        beq     cr6,Laligned        ; skip if already word aligned (r2==0 if aligned)
; Partial word at start: zero filled on left, it becomes initial checksum.

        rlwinm  r3,r3,0,0,29        ; word align address
        mtcrf   0x01,r2             ; move byte offset to cr7
        lwz     r6,0(r3)            ; get partial word
        li      r7,-1               ; start of mask for partial fill
        slwi    r8,r2,3             ; multiply byte offset by 8
        sub     r4,r4,r0            ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt    ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8            ; get mask for bytes to keep in partial word
        addi    r3,r3,4             ; point to next word of input
        and     r2,r6,r7            ; zero fill on left
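
; In C terms (illustrative variable names), the zero-filled partial word is:
;   uint32_t offset = addr & 3;                     // byte offset in word
;   uint32_t mask   = 0xFFFFFFFFu >> (8 * offset);  // keep the rightmost bytes
;   uint32_t sum0   = *(uint32_t *)(addr & ~3u) & mask;
; Note that word-aligning an odd byte offset flips halfword parity, which is
; what the crxor on the "odd address" flag above accounts for.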
; Address is now word aligned.  Prepare for inner loop over 32-byte chunks.
;   r2 = initial checksum
;   r3 = word aligned address
;   r4 = length remaining
;   r5 = accumulated sum parameter
;   carry = off
;   cr1_gt = "starting on odd address" flag
Laligned:
        srwi.   r0,r4,5             ; get count of 32-byte chunks
        mtcrf   0x02,r4             ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers      ; no chunks

        mtctr   r0                  ; set up loop count
        li      r4,32               ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath          ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3               ; touch in 2nd cache line
        li      r0,96               ; get touch offset
        b       LInnerLoop32        ; enter 32-bit loop
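
; The chunking logic, in C terms (illustrative):
;   uint32_t chunks   = len >> 5;       // count of 32-byte chunks (srwi.)
;   uint32_t residual = len & 0x1F;     // low 5 bits, kept in cr6/cr7
; "bf 27,..." below tests the 0x10 bit of the residual, i.e. CR bit 27,
; which mtcrf loaded from bit 27 (value 0x10) of r4.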
; Inner loop for 32-bit machines.
; Handle leftover bytes.
;   r2 = checksum so far
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
Lleftovers:
        bf      27,Lleftover8       ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane
; All data bytes checksummed.  Wrap up.
;   r2 = checksum so far (word parallel)
;   r5 = accumulated sum parameter
;   carry = carry bit, which must be added in
;   cr1_gt = "starting on odd address" flag
Lwrapup:
        addze   r2,r2               ; add in last carry
        addze   r2,r2               ; in case the "addze" carries
Lwrapupx:                           ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
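
; Worked example of the double fold: if r2 = 0x0001FFFF, the first fold
; yields 0x0001 + 0xFFFF = 0x00010000 (a carry out of the low half), and
; the second yields 0x0001 + 0x0000 = 0x00000001, which fits in 16 bits.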
        bf      cr1_gt,Lswapped     ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF     ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00      ; bottom to top
        or      r2,r6,r7            ; rejoin the swapped halves
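
; In C terms: sum = ((sum >> 8) | (sum << 8)) & 0xFFFF.  A 1s-complement
; sum is endian-agnostic except for byte order, so starting on an odd
; address only costs this single byte-lane swap at the end.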
; Finally, add in the checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5            ; add passed-in checksum
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7            ; steer result into r3
        blr                         ; return checksum in r3
; Handle short operands.  Do a halfword at a time.
;   r3 = data
;   r4 = length (<= kShort)
;   r5 = accumulated sum parameter
;   r6 = "starting on odd byte" flag
Lshort:
        cmpwi   cr6,r4,2            ; at least two bytes?
        andi.   r0,r4,1             ; odd length?
        li      r2,0                ; initialize checksum
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2         ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4            ; two more bytes (after we decrement)?
        lhz     r7,0(r3)            ; get next halfword
        addi    r3,r3,2             ; advance to next halfword
        subi    r4,r4,2             ; decrement length remaining
        add     r2,r2,r7            ; note no need for "adde"
        bge     cr6,Lshort1         ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx            ; no byte at end, proceed to checkout with carry undefined
        lbz     r7,0(r3)            ; get last byte
        slwi    r7,r7,8             ; shift last byte into proper lane
        add     r2,r2,r7            ; add it in
        b       Lwrapupx            ; wrap up, with carry undefined
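
; A C sketch of this short path (illustrative; no carries can occur because
; at most kShort bytes are summed into a 32-bit accumulator):
;   uint32_t sum = 0;
;   while (len >= 2) {
;       sum += *(const uint16_t *)p;    // halfword at a time
;       p += 2; len -= 2;
;   }
;   if (len)                            // odd trailing byte
;       sum += (uint32_t)*p << 8;       // goes in the high lane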
; Handle 64-bit machine.  The major improvement over the 32-bit path is that we use
; four parallel 32-bit accumulators, which carry into the upper half naturally so we
; do not have to use "adde", which serializes on the carry bit.  Note that we cannot
; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
;   r2 = checksum so far (i.e., the zero-filled partial first word)
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   ctr = number of 32-byte chunks of input
;   carry = unused in this code
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
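
; The accumulation scheme, sketched in C (illustrative; w points at the
; word-aligned input, "partial" is the zero-filled first word in r2):
;   uint64_t a0 = 0, a1 = partial, a2 = 0, a3 = 0;
;   while (chunks--) {                        // 32 bytes per iteration
;       a0 += w[0]; a1 += w[1]; a2 += w[2]; a3 += w[3];   // independent adds,
;       a0 += w[4]; a1 += w[5]; a2 += w[6]; a3 += w[7];   // no carry chain
;       w += 8;
;   }
;   uint64_t sum = a0 + a1 + a2 + a3;         // folded to 16 bits at Lwrapup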
L64BitPath:
        stw     r13,-4(r1)          ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128              ; to touch next line
        li      r13,0               ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0               ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)            ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2              ; just copy incoming partial word into one of the accumulators
        li      r15,0
        li      r16,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r0,256              ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)          ; load last word of the chunk
        addi    r3,r3,32            ; skip past the chunk
        bdnz++  LInnerLoop64        ; enter loop if another chunk to go
        b       LAddLastChunk       ; only one chunk
; Inner loop for 64-bit processors.  This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
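
; The software pipelining, sketched in C (illustrative):
;   uint32_t cur[8], nxt[8];
;   memcpy(cur, p, 32); p += 8;               // prologue: first chunk loaded
;   while (--chunks) {
;       memcpy(nxt, p, 32); p += 8;           // loads for the NEXT iteration
;       for (int i = 0; i < 8; i++)           // overlap the adds for this one
;           acc[i & 3] += cur[i];
;       memcpy(cur, nxt, sizeof cur);
;   }
;   for (int i = 0; i < 8; i++)               // epilogue: add in last chunk
;       acc[i & 3] += cur[i];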
LInnerLoop64:                       ; 64 bytes/iteration
        add     r13,r13,r4          ; cycle 1
        dcbt    r3,r0               ; touch in 2 lines ahead
        add     r15,r15,r7          ; cycle 2, etc
        bdz--   LEarlyExit          ; early exit if no more chunks
        nop                         ; position last load in 2nd dispatch slot
; Add in the last 32-byte chunk, and any leftover bytes.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
LEarlyExit:                         ; here from middle of inner loop
        lwz     r12,28(r3)          ; load last word of last chunk

LAddLastChunk:                      ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4          ; add in last chunk
        add     r14,r14,r6          ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12
; Handle leftover bytes, if any.

        bf      27,Lleft1           ; test 0x10 bit of residual length
        slwi    r4,r4,8             ; shift last byte into proper lane
; All data bytes have been checksummed.  Now we must add together the four
; accumulators and restore the regs from the red zone.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   carry = not used so far
;   cr1_gt = "starting on odd address" flag
        add     r8,r13,r14          ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)          ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9            ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32            ; get upper half of 64-bit sum
        addc    r2,r7,r8            ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup             ; merge r2, r5, and carry into a 16-bit checksum
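
; The final 64-to-32-bit fold, in C terms (illustrative):
;   uint32_t hi = (uint32_t)(sum >> 32);    // srdi r7,r8,32
;   uint32_t lo = (uint32_t)sum;
;   uint32_t s  = hi + lo;                  // addc records any carry in XER[CA]
; Lwrapup then adds that carry back in (the addze pair) and folds 32 -> 16.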