/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define kShort 11
#define cr1_gt 5 // bit 1 of cr1

/*
 * short xsum_assym(short *p, int len, short xsum, boolean odd);
 *
 * r3 - Pointer to data
 * r4 - Length of data
 * r5 - Accumulated sum value
 * r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical for now.
 */
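
/*
 * For reference, a minimal C sketch of the algorithm described above (the
 * names here are illustrative, invented for this note, and appear nowhere
 * else in the kernel). The data is summed as 16-bit words in a wider
 * accumulator, and the accumulator is then folded down to 16 bits; the fold
 * is what makes the parallel wide adds equivalent to a 1s-complement sum:
 *
 *     static unsigned short
 *     xsum_sketch(const unsigned short *p, int nwords)
 *     {
 *         unsigned long sum = 0;        // carries collect above bit 15
 *         while (nwords--)
 *             sum += *p++;              // plain adds, no per-add carry handling
 *         while (sum >> 16)             // fold carries back in until sum fits in 16 bits
 *             sum = (sum & 0xFFFF) + (sum >> 16);
 *         return (unsigned short)sum;   // like this routine, returns the uncomplemented sum
 *     }
 */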
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort           ; too short to word align?
        rlwinm  r2,r3,0,0x3             ; get byte offset in word
        dcbt    0,r3                    ; touch in 1st cache line
        cmpwi   cr6,r2,0                ; is address word aligned?
        ble     cr0,Lshort              ; skip if too short to bother aligning

        subfic  r0,r2,4                 ; get #bytes in partial word
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0                 ; turn off carry
        beq     cr6,Laligned            ; skip if already word aligned (r2==0 if aligned)

; Partial word at start: zero filled on left, it becomes initial checksum.

        rlwinm  r3,r3,0,0,29            ; word align address
        mtcrf   0x01,r2                 ; move byte offset to cr7
        lwz     r6,0(r3)                ; get partial word
        li      r7,-1                   ; start of mask for partial fill
        slwi    r8,r2,3                 ; multiply byte offset by 8
        sub     r4,r4,r0                ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt        ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8                ; get mask for bytes to keep in partial word
        addi    r3,r3,4                 ; point to next word of input
        and     r2,r6,r7                ; zero fill on left

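; In C terms (illustrative names, not from this file), the fill above computes,
; for a byte offset "off" in 1..3 within the first (big-endian) word:
;
;     uint32_t mask    = 0xFFFFFFFFu >> (off * 8); // e.g. off==1 keeps the low 3 bytes
;     uint32_t partial = word & mask;              // bytes before the buffer are zeroed
;     odd ^= (off & 1);                            // the crxor: if alignment consumed an
;                                                  ; odd number of bytes, lane parity flips
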
; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
; r2 = initial checksum
; r3 = word aligned address
; r4 = length remaining
; r5 = accumulated sum parameter
; carry = off
; cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5                 ; get count of 32-byte chunks
        mtcrf   0x02,r4                 ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers          ; no chunks

        mtctr   r0                      ; set up loop count
        li      r4,32                   ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath              ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3                   ; touch in 2nd cache line
        li      r0,96                   ; get touch offset
        b       LInnerLoop32            ; enter 32-bit loop

; Inner loop for 32-bit machines.

        .align  4
LInnerLoop32:
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        adde    r2,r2,r4
        lwz     r9,16(r3)
        adde    r2,r2,r6
        lwz     r10,20(r3)
        adde    r2,r2,r7
        lwz     r11,24(r3)
        adde    r2,r2,r8
        lwz     r12,28(r3)
        adde    r2,r2,r9
        dcbt    r3,r0
        adde    r2,r2,r10
        addi    r3,r3,32
        adde    r2,r2,r11
        adde    r2,r2,r12
        bdnz+   LInnerLoop32

; Handle leftover bytes.
; r2 = checksum so far
; r3 = word aligned address
; r5 = accumulated sum parameter
; carry = live
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8           ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        adde    r2,r2,r4
        adde    r2,r2,r6
        adde    r2,r2,r7
        adde    r2,r2,r8
Lleftover8:
        bf      28,Lleftover4
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        adde    r2,r2,r4
        adde    r2,r2,r6
Lleftover4:
        bf      29,Lleftover2
        lwz     r4,0(r3)
        addi    r3,r3,4
        adde    r2,r2,r4
Lleftover2:
        bf      30,Lleftover1
        lhz     r4,0(r3)
        addi    r3,r3,2
        adde    r2,r2,r4
Lleftover1:
        bf      31,Lwrapup
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        adde    r2,r2,r4

; All data bytes checksummed. Wrap up.
; r2 = checksum so far (word parallel)
; r5 = accumulated sum parameter
; carry = live
; cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2                   ; add in last carry
        addze   r2,r2                   ; in case the "addze" carries
Lwrapupx:                               ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
        bf      cr1_gt,Lswapped         ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF         ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00          ; bottom to top
        or      r2,r6,r7                ; rejoin

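; Why a byte swap is sufficient (illustrative C): a 1s-complement sum computed
; with every 16-bit lane rotated by one byte equals the true sum rotated by one
; byte, so swapping the two bytes of the folded result recovers the checksum:
;
;     sum = ((sum >> 8) | (sum << 8)) & 0xFFFF;    // undo the one-byte rotation
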
; Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5                ; add passed-in checksum
        srwi    r6,r2,16                ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF          ; lower half
        add     r2,r6,r7                ; add them together
        srwi    r6,r2,16                ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7                ; steer result into r3
        blr

; Handle short operands. Do a halfword at a time.
; r3 = address
; r4 = length (<= kShort)
; r5 = accumulated sum parameter
; r6 = "starting on odd address" flag

Lshort:
        cmpwi   cr6,r4,2                ; at least two bytes?
        andi.   r0,r4,1                 ; odd length?
        li      r2,0                    ; initialize checksum
        cmplwi  cr1,r6,0                ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2             ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4                ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7                ; no "adde" needed: at most 5 halfwords, so no 32-bit overflow
        bge     cr6,Lshort1             ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx                ; no byte at end, proceed to wrap-up with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8                 ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx

; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
; four parallel accumulators, adding 32-bit words into 64-bit registers; carries go
; into the upper halves naturally, so we do not have to use "adde", which serializes
; on the carry bit. Note that we cannot do 64-bit "adde"s, because we run in 32-bit
; mode so carry would not be set correctly.
; r2 = checksum so far (i.e., the zero-filled partial first word)
; r3 = word aligned address
; r5 = accumulated sum parameter
; ctr = number of 32-byte chunks of input
; carry = unused in this code
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

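; In illustrative C (names invented for this note), the trick is simply:
;
;     uint64_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;     // four independent dependency chains
;     for ( ; nwords >= 8; p += 8, nwords -= 8) {  // 32 bytes per iteration
;         a0 += p[0]; a1 += p[1]; a2 += p[2]; a3 += p[3];
;         a0 += p[4]; a1 += p[5]; a2 += p[6]; a3 += p[7];
;     }
;     uint64_t s = a0 + a1 + a2 + a3;              // then fold 64 -> 32 -> 16 bits
;
; Each 32-bit word is at most 0xFFFFFFFF, so a 64-bit accumulator can absorb
; billions of them before overflowing; carries out of bit 31 just land in the
; upper half, to be folded once at the end.
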
L64BitPath:
        stw     r13,-4(r1)              ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128                  ; to touch next line
        li      r13,0                   ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0                   ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)                ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2                  ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256                  ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)              ; load last word of first chunk
        addi    r3,r3,32                ; skip past the chunk
        bdnz++  LInnerLoop64            ; enter loop if another chunk to go

        b       LAddLastChunk           ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
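;
; Ignoring the 2x unrolling, the software pipeline looks like this in
; illustrative pseudo-C:
;
;     load chunk 0 into r4,r6..r12;
;     while (--chunks) {
;         accumulate r4,r6..r12;                // adds for the chunk loaded last time
;         load the next chunk into r4,r6..r12;  // overlapped with those adds
;     }
;     accumulate r4,r6..r12;                    // drain the final preloaded chunk
;
; LEarlyExit and LAddLastChunk below implement the drain step.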

        .align  5
LInnerLoop64:                           ; 64 bytes/iteration
        add     r13,r13,r4              ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0                   ; touch in 2 lines ahead
        lwz     r4,0(r3)

        add     r15,r15,r7              ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8

        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)

        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit              ; early exit if no more chunks

        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)

        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8

        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)

        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12

        nop                             ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64

        b       LAddLastChunk

; Add in the last 32-byte chunk, and any leftover bytes.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; cr1_gt = "starting on odd address" flag
; cr6,cr7 = residual length

LEarlyExit:                             ; here from middle of inner loop
        lwz     r12,28(r3)              ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                          ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4              ; add in last chunk
        add     r14,r14,r6              ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

; Handle leftover bytes, if any.

        bf      27,Lleft1               ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5
        lbz     r4,0(r3)
        slwi    r4,r4,8                 ; shift last byte into proper lane
        add     r16,r16,r4

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
; r3 = word aligned address of next byte of data
; r5 = accumulated sum parameter
; r13-r16 = the four accumulators
; carry = not used so far
; cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14              ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)              ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9                ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32                ; get upper half of 64-bit sum
        addc    r2,r7,r8                ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup                 ; merge r2, r5, and carry into a 16-bit checksum
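
; For reference, the final fold in illustrative C (names invented here):
;
;     uint64_t s = a0 + a1 + a2 + a3;                  // 64-bit total, cannot overflow
;     uint32_t r = (uint32_t)(s >> 32) + (uint32_t)s;  // may carry out of bit 31...
;
; ...which is why "addc" (rather than "add") is used above: it records that
; carry, and the "addze" at Lwrapup adds it back before folding to 16 bits.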