bsd/dev/arm64/cpu_in_cksum.s

   1 /*
   2  * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*
  30  * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
  31  * with __arm64__ tagged ARM64_TODO .  This code revision is optimized based
  32  * on the 64-bit part in netinet/cpu_in_cksum.c
  33  *
  34  * cclee - CoreOS - Vector & Numerics. 06/20/2012.
  35  */
  36
  37 #ifdef KERNEL
  38 #define CKSUM_ERR _kprintf
  39 #else
  40 #ifndef LIBSYSCALL_INTERFACE
  41 #error "LIBSYSCALL_INTERFACE not defined"
  42 #endif /* !LIBSYSCALL_INTERFACE */
  43 #define CKSUM_ERR _fprintf_stderr
  44 #endif /* !KERNEL */
  45
  46 /*
  47  * XXX: adi@apple.com:
  48  *
  49  * Ugly, but we have little choice, since relying on genassym and <assym.s>
  50  * is not possible unless this code lives in osfmk.  Note also that this
   51  * routine expects an "mbuf-like" argument, and it does not expect the mbuf
   52  * to be authentic; it only cares about 3 fields.
  53  */
  54 #if defined(__LP64__)
  55 #define M_NEXT  0
  56 #define M_DATA  16      // 8-byte address, would be aligned to 8-byte boundary
  57 #define M_LEN   24
  58 #else
  59 #define M_NEXT  0
  60 #define M_DATA  8
  61 #define M_LEN   12
  62 #endif
  63
  64         .globl  _os_cpu_in_cksum_mbuf
  65         .text
  66         .align  4
  67 _os_cpu_in_cksum_mbuf:
  68
  69
  70 /*
  71  * 64-bit version.
  72  *
  73  * This function returns the partial 16-bit checksum accumulated in
  74  * a 32-bit variable (withouth 1's complement); caller is responsible
  75  * for folding the 32-bit sum into 16-bit and performinng the 1's
  76  * complement if applicable
  77  */
  78
  79 /*
  80  * uint32_t
  81  * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
  82  * {
  83  *      int mlen;
  84  *      uint64_t sum, partial;
  85  *      unsigned int final_acc;
  86  *      uint8_t *data;
  87  *      boolean_t needs_swap, started_on_odd;
  88  *
  89  *      VERIFY(len >= 0);
  90  *      VERIFY(off >= 0);
  91  *
  92  *      needs_swap = FALSE;
  93  *      started_on_odd = FALSE;
  94  *      sum = initial_sum;
  95  */
  96
  97         #define m               x0
  98         #define len             x1
  99         #define off             x2
 100         #define sum             x3
 101         #define needs_swap      x4
 102         #define started_on_odd  x5
 103         #define mlen                    x6
 104         #define Wmlen                   w6
 105         #define t       x7
 106         #define data    x8
 107 #if defined(__LP64__)
 108         #define ptr_m           x0
 109         #define ptr_data        x8
 110 #else
 111         #define ptr_m           w0
 112         #define ptr_data        w8
 113 #endif
 114
 115
 116         mov     needs_swap, #0          // needs_swap = FALSE;
 117         mov     started_on_odd, #0      // started_on_odd = FALSE;
 118         mov     w3, w3                  // clear higher half
 119
 120
 121 /*
 122  *      for (;;) {
 123  *              if (PREDICT_FALSE(m == NULL)) {
 124  *                      CKSUM_ERR("%s: out of data\n", __func__);
 125  *                      return (-1);
 126  *              }
 127  *              mlen = m->m_len;
 128  *              if (mlen > off) {
 129  *                      mlen -= off;
 130  *                      data = mtod(m, uint8_t *) + off;
 131  *                      goto post_initial_offset;
 132  *              }
 133  *              off -= mlen;
 134  *              if (len == 0)
 135  *                      break;
 136  *              m = m->m_next;
 137  *      }
 138  */
 139
  140 0:                                  // top of the loop that skips the first `off` bytes
  141         cbz     m, Lin_cksum_whoops     // if (m == NULL) return -1;
  142         ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len; (32-bit load, upper half of x6 zeroed)
  143         cmp     mlen, off               // 64-bit signed compare of mlen against off
  144         b.le    1f                      // mlen <= off: this mbuf is skipped entirely
  145         ldr     ptr_data, [m, #M_DATA]  // mtod(m, uint8_t *)
  146         sub     mlen, mlen, off         // mlen -= off;
  147         add     data, data, off         // data = mtod(m, uint8_t *) + off;
  148         b       L_post_initial_offset   // goto post_initial_offset;
  149 1:
  150         sub     off, off, mlen          // off -= mlen;
  151         cbnz    len, 2f                 // if (len != 0) move on to the next mbuf
  152         mov     x0, x3                  // len == 0: return sum as it stands (this exit does not fold to 16 bits)
  153         ret     lr
  154 2:
  155         ldr     ptr_m, [m, #M_NEXT]     // m = m->m_next;
  156         b       0b                      // next iteration of the skip loop
 157
  158 L_loop: // for (; len > 0; m = m->m_next) {   -- one pass per mbuf; re-entered from L_continue
  159 /*
  160  *              if (PREDICT_FALSE(m == NULL)) {
  161  *                      CKSUM_ERR("%s: out of data\n", __func__);
  162  *                      return (-1);
  163  *              }
  164  *              mlen = m->m_len;
  165  *              data = mtod(m, uint8_t *);
  166  */
  167         cbz     m, Lin_cksum_whoops     // if (m == NULL) return -1; (chain ended while len > 0)
  168         ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len; (32-bit load, upper half of x6 zeroed)
  169         ldr     ptr_data, [m, #M_DATA]  // data = mtod(m, uint8_t *);
  170
  171 L_post_initial_offset:              // also entered directly from the initial-offset loop
  172 /*
  173  *              if (mlen == 0) continue;
  174  *              if (mlen > len) mlen = len;
  175  *              len -= mlen;
  176  */
  177
  178         cbz     mlen, L_continue        // if (mlen == 0) continue;
  179         cmp     mlen, len               // 64-bit signed compare
  180         csel    mlen, mlen, len, le     // mlen = (mlen <= len) ? mlen : len;
  181         sub     len, len, mlen          // len -= mlen;
 182
 183 /*
 184  *              partial = 0;
 185  *              if ((uintptr_t)data & 1) {
 186  *                      started_on_odd = !started_on_odd;
 187  *                      partial = *data << 8;
 188  *                      ++data;
 189  *                      --mlen;
 190  *              }
 191  *              needs_swap = started_on_odd;
 192  */
 193
  194         tst     data, #1                // is data at an odd address? (flags consumed by b.eq below)
  195         mov     x7, #0                  // partial = 0; (mov leaves the flags alone)
  196         mov     x10, #0                 // x10 = 0: zero addend for the adc in L_trailing_bytes when the tail is entered early
  197         b.eq    1f                      // even address: no leading byte to peel off
  198         ldrb    w7, [data], #1          // w7 = *data; ++data;
  199         eor     started_on_odd, started_on_odd, #1      // started_on_odd = !started_on_odd;
  200         sub     mlen, mlen, #1          // --mlen;
  201         lsl     w7, w7, #8              // partial = *data << 8;
  202 1:
  203
  204
  205 /*
  206  *              if ((uintptr_t)data & 2) {
  207  *                      if (mlen < 2)
  208  *                              goto trailing_bytes;
  209  *                      partial += *(uint16_t *)(void *)data;
  210  *                      data += 2;
  211  *                      mlen -= 2;
  212  *              }
  213  */
  214         tst     data, #2                // is data 2-byte but not 4-byte aligned?
  215         mov     needs_swap, started_on_odd      // needs_swap = started_on_odd; (flags untouched)
  216         b.eq    1f                      // already 4-byte aligned: skip the halfword
  217         cmp     mlen, #2
  218         b.lt    L_trailing_bytes        // if (mlen < 2) goto trailing_bytes;
  219         ldrh    w9, [data], #2          // w9 = *(uint16_t *)data; data += 2;
  220         sub     mlen, mlen, #2          // mlen -= 2;
  221         add     w7, w7, w9              // partial += w9; (32-bit add, upper half of x7 stays zero)
  222 1:
 223
 224 /*
 225  *              while (mlen >= 64) {
 226  *                      __builtin_prefetch(data + 32);
 227  *                      __builtin_prefetch(data + 64);
 228  *                      partial += *(uint32_t *)(void *)data;
 229  *                      partial += *(uint32_t *)(void *)(data + 4);
 230  *                      partial += *(uint32_t *)(void *)(data + 8);
 231  *                      partial += *(uint32_t *)(void *)(data + 12);
 232  *                      partial += *(uint32_t *)(void *)(data + 16);
 233  *                      partial += *(uint32_t *)(void *)(data + 20);
 234  *                      partial += *(uint32_t *)(void *)(data + 24);
 235  *                      partial += *(uint32_t *)(void *)(data + 28);
 236  *                      partial += *(uint32_t *)(void *)(data + 32);
 237  *                      partial += *(uint32_t *)(void *)(data + 36);
 238  *                      partial += *(uint32_t *)(void *)(data + 40);
 239  *                      partial += *(uint32_t *)(void *)(data + 44);
 240  *                      partial += *(uint32_t *)(void *)(data + 48);
 241  *                      partial += *(uint32_t *)(void *)(data + 52);
 242  *                      partial += *(uint32_t *)(void *)(data + 56);
 243  *                      partial += *(uint32_t *)(void *)(data + 60);
 244  *                      data += 64;
 245  *                      mlen -= 64;
 246  *              //      if (PREDICT_FALSE(partial & (3ULL << 62))) {
 247  *              //              if (needs_swap)
 248  *              //                      partial = (partial << 8) +
 249  *              //                          (partial >> 56);
 250  *              //              sum += (partial >> 32);
 251  *              //              sum += (partial & 0xffffffff);
 252  *              //              partial = 0;
 253  *              //      }
 254  *              }
 255 */
 256
  257         // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
  258         subs    mlen, mlen, #64
  259         b.lt    L32_bytes               // mlen stays biased by -64; the tail only tests bits 0-5, which the bias does not change
  260
  261         // save used vector registers
  262         sub     sp, sp, #8*16           // reserve 128 bytes: eight 16-byte vector registers
  263         mov     x11, sp                 // x11 = store cursor, so sp keeps pointing at the base of the save area
  264         st1.4s  {v0, v1, v2, v3}, [x11], #4*16
  265         st1.4s  {v4, v5, v6, v7}, [x11], #4*16
  266
  267         // spread partial into 8 8-byte registers in v0-v3
  268         fmov    s3, w7                  // v3 = low 32 bits of partial, rest of v3 zeroed (partial was built with 32-bit ops)
  269         eor.16b v0, v0, v0              // v0 = 0
  270         eor.16b v1, v1, v1              // v1 = 0
  271         eor.16b v2, v2, v2              // v2 = 0
  272
  273         // load the 1st 64 bytes (16 32-bit words)
  274         ld1.4s  {v4,v5,v6,v7},[data],#64
  275
  276         // branch to finish off if mlen<64
  277         subs    mlen, mlen, #64         // is another full 64-byte chunk available after the one just loaded?
  278         b.lt    L64_finishup            // no: only accumulate what is already in v4-v7
  279
  280         /*
  281          * loop for loading and accumulating 16 32-bit words into
  282          * 8 8-byte accumulators per iteration.
  283          */
  284 L64_loop:
  285         subs        mlen, mlen, #64             // mlen -= 64
  286
  287         uadalp.2d   v0, v4                      // v0 += pairwise 64-bit sums of the four 32-bit words in v4
  288         ld1.4s      {v4},[data], #16            // refill v4 with the next 16 bytes
  289
  290         uadalp.2d   v1, v5                      // same accumulate-then-refill step for v1/v5
  291         ld1.4s      {v5},[data], #16
  292
  293         uadalp.2d   v2, v6                      // ... for v2/v6
  294         ld1.4s      {v6},[data], #16
  295
  296         uadalp.2d   v3, v7                      // ... for v3/v7
  297         ld1.4s      {v7},[data], #16
  298
  299         b.ge        L64_loop                    // flags are still those of the subs above; the vector ops do not touch them
  300
  301 L64_finishup:
  302         uadalp.2d   v0, v4                      // accumulate the last 64 bytes that were loaded
  303         uadalp.2d   v1, v5
  304         uadalp.2d   v2, v6
  305         uadalp.2d   v3, v7
  306
  307         add.2d      v0, v0, v1                  // reduce eight 64-bit accumulators to four ...
  308         add.2d      v2, v2, v3
  309         addp.2d     d0, v0                      // ... to two: d0 = v0[0] + v0[1]
  310         addp.2d     d2, v2                      //             d2 = v2[0] + v2[1]
  311         add.2d      v0, v0, v2                  // low lane of v0 = d0 + d2
  312         fmov        x7, d0                      // partial in x7 now
  313
  314         // restore used vector registers
  315         ld1.4s      {v0, v1, v2, v3}, [sp], #4*16       // post-increment releases the first half of the save area
  316         ld1.4s      {v4, v5, v6, v7}, [sp], #4*16       // sp is back at its value from before the save
 317
  318 L32_bytes:                          // scalar tail: consume the low bits of mlen, largest chunk first
  319         tst     mlen, #32               // is a 32-byte chunk pending?
  320         b.eq    L16_bytes
  321         ldp     x9, x10, [data], #16    // load 32 bytes as four 64-bit words
  322         ldp     x11, x12, [data], #16
  323         adds    x7, x7, x9              // partial += word; carry-out kept in the C flag
  324         mov     x9, #0                  // x9 = 0 for the final carry fold (mov leaves the flags alone)
  325         adcs    x7, x7, x10             // add the next word plus the previous carry
  326         adcs    x7, x7, x11
  327         adcs    x7, x7, x12
  328         adc     x7, x7, x9              // fold the last carry back into partial
  329
  330 L16_bytes:
  331         tst     mlen, #16               // is a 16-byte chunk pending?
  332         b.eq    L8_bytes
  333         ldp     x9, x10, [data], #16    // load 16 bytes as two 64-bit words
  334         adds    x7, x7, x9
  335         mov     x9, #0                  // x9 = 0 for the carry fold
  336         adcs    x7, x7, x10
  337         adc     x7, x7, x9              // fold the last carry back into partial
  338
  339 L8_bytes:
  340         tst     mlen, #8                // is an 8-byte chunk pending?
  341         mov     x10, #0                 // x10 = 0 from here on: zero addend for every adc below (flags untouched)
  342         b.eq    L4_bytes
  343         ldr     x9,[data],#8            // x9 = next 8 bytes; data += 8;
  344         adds    x7, x7, x9
  345         adc     x7, x7, x10             // partial += carry;
  346
  347 L4_bytes:
  348         tst     mlen, #4                // is a 4-byte chunk pending?
  349         b.eq    L2_bytes
  350         ldr     w9,[data],#4            // 32-bit load, zero-extended into x9; data += 4;
  351         adds    x7, x7, x9
  352         adc     x7, x7, x10             // partial += carry;
  353
  354 L2_bytes:
  355         tst     mlen, #2                // is a 2-byte chunk pending?
  356         b.eq    L_trailing_bytes
  357         ldrh    w9,[data],#2            // 16-bit load, zero-extended; data += 2;
  358         adds    x7, x7, x9
  359         adc     x7, x7, x10             // partial += carry;
  360
  361 L_trailing_bytes:                   // also entered from the halfword fixup when mlen < 2
  362         tst     mlen, #1                // is one last byte pending?
  363         b.eq    L0_bytes
  364         ldrb    w9,[data],#1            // w9 = *data; ++data;
  365         adds    x7, x7, x9              // partial += byte;
  366         adc     x7, x7, x10             // partial += carry;
  367         eor     started_on_odd, started_on_odd, #1      // started_on_odd = !started_on_odd;
 368
  369 L0_bytes:                           // every byte of this mbuf has been added into partial (x7)
  370 /*
  371  *              if (needs_swap)
  372  *                      partial = (partial << 8) + (partial >> 56);
  373  */
  374         cbz     needs_swap, 1f          // no swap needed: skip the rotate
  375         ror     x7, x7, #56             // rotate right by 56 == rotate left by 8
  376 1:
  377 /*
  378  *              sum += (partial >> 32) + (partial & 0xffffffff);
  379  *              sum = (sum >> 32) + (sum & 0xffffffff);
  380  *      }
  381  */
  382
  383         add     x3, x3, x7, lsr #32     // sum += partial >> 32;
  384         mov     w7, w7                  // x7 = partial & 0xffffffff
  385         add     x3, x3, x7              // sum += partial & 0xffffffff;
  386         mov     w7, w3                  // x7 = sum & 0xffffffff
  387         add     x3, x7, x3, lsr #32     // sum = (sum >> 32) + (sum & 0xffffffff);
 388
  389 L_continue:                         // bottom of the per-mbuf loop
  390         cmp     len, #0                 // any bytes left to checksum?
  391         ldr     ptr_m, [m, #M_NEXT]                     // m = m->m_next (the load leaves the flags from cmp intact)
  392         b.gt    L_loop                  // loop while (len > 0); otherwise fall through
 393
 394 /*
 395  *      final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 396  *          ((sum >> 16) & 0xffff) + (sum & 0xffff);
 397  *      final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 398  *      final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 399  *      return (final_acc & 0xffff);
 400  * }
 401  */
 402
 403         mov     w4, #0x00ffff
 404         and     x0, x4, x3, lsr #48
 405         and     x1, x4, x3, lsr #32
 406         and     x2, x4, x3, lsr #16
 407         and     x3, x4, x3
 408         add     w0, w0, w1
 409         add     w2, w2, w3
 410         add     w0, w0, w2
 411         and     w1, w4, w0, lsr #16
 412         and     w0, w4, w0
 413         add     w0, w0, w1
 414         and     w1, w4, w0, lsr #16
 415         and     w0, w4, w0
 416         add     w0, w0, w1
 417         /*
 418          * If we were to 1's complement it (XOR with 0xffff):
 419          *
 420          * eor          w0, w0, w4
 421          */
 422         and     w0, w0, w4
 423
 424         ret     lr
 425
 426 Lin_cksum_whoops:
 427         adrp    x0, Lin_cksum_whoops_str@page
 428         add     x0, x0, Lin_cksum_whoops_str@pageoff
 429         bl      #CKSUM_ERR
 430         mov     x0, #-1
 431         ret     lr
 432
 433 Lin_cksum_whoops_str:
 434         .asciz  "os_cpu_in_cksum_mbuf: out of data\n"
 435         .align  5