]> git.saurik.com Git - apple/xnu.git/blob - bsd/dev/arm64/cpu_in_cksum.s
xnu-4570.51.1.tar.gz
[apple/xnu.git] / bsd / dev / arm64 / cpu_in_cksum.s
1 /*
2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
31 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
32 * on the 64-bit part in netinet/cpu_in_cksum.c
33 *
34 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
35 */
36
/* CKSUM_ERR: routine used to report the "out of data" error in the mbuf walk below */
37 #ifdef KERNEL
38 #define CKSUM_ERR _kprintf              // kernel build: report via kprintf
39 #else
40 #ifndef LIBSYSCALL_INTERFACE
41 #error "LIBSYSCALL_INTERFACE not defined"
42 #endif /* !LIBSYSCALL_INTERFACE */
43 #define CKSUM_ERR _fprintf_stderr       // userland (Libsyscall) build: report to stderr
44 #endif /* !KERNEL */
45
46 /*
47 * XXX: adi@apple.com:
48 *
49 * Ugly, but we have little choice, since relying on genassym and <assym.s>
50 * is not possible unless this code lives in osfmk. Note also that this
51 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
52 * authentic; it only cares about 3 fields.
53 */
54 #define M_NEXT 0    // byte offset of m->m_next (pointer to next mbuf in chain)
55 #define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary
56 #define M_LEN 24    // byte offset of m->m_len (32-bit length of this mbuf's data)
57
58 .globl _os_cpu_in_cksum_mbuf
59 .text
60 .align 4
61 _os_cpu_in_cksum_mbuf:
// Register usage (AAPCS64): x0 = struct mbuf *m, w1 = len, w2 = off,
// w3 = initial_sum; 32-bit partial sum returned in w0, or -1 on error.
// Only x0-x12 and v0-v7 are used; no callee-saved GPRs are touched
// (v0-v7 are spilled/restored around the SIMD loop below).
62 
63 
64 /*
65 * 64-bit version.
66 *
67 * This function returns the partial 16-bit checksum accumulated in
68 * a 32-bit variable (without 1's complement); caller is responsible
69 * for folding the 32-bit sum into 16-bit and performing the 1's
70 * complement if applicable
71 */
72 
73 /*
74 * uint32_t
75 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
76 * {
77 * int mlen;
78 * uint64_t sum, partial;
79 * unsigned int final_acc;
80 * uint8_t *data;
81 * boolean_t needs_swap, started_on_odd;
82 *
83 * VERIFY(len >= 0);
84 * VERIFY(off >= 0);
85 *
86 * needs_swap = FALSE;
87 * started_on_odd = FALSE;
88 * sum = initial_sum;
89 */
90 
// Symbolic register names for the C locals above.
91 #define m x0
92 #define len x1
93 #define off x2
94 #define sum x3
95 #define needs_swap x4
96 #define started_on_odd x5
97 #define mlen x6
98 #define Wmlen w6
99 #define t x7
100 #define data x8
101
102 mov needs_swap, #0 // needs_swap = FALSE;
103 mov started_on_odd, #0 // started_on_odd = FALSE;
104 mov w3, w3 // clear higher half: x3 (sum) = zero-extended initial_sum
105 
106 
107 /*
108 * for (;;) {
109 * if (PREDICT_FALSE(m == NULL)) {
110 * CKSUM_ERR("%s: out of data\n", __func__);
111 * return (-1);
112 * }
113 * mlen = m->m_len;
114 * if (mlen > off) {
115 * mlen -= off;
116 * data = mtod(m, uint8_t *) + off;
117 * goto post_initial_offset;
118 * }
119 * off -= mlen;
120 * if (len == 0)
121 * break;
122 * m = m->m_next;
123 * }
124 */
125 
126 0:
127 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
128 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; (zero-extended into x6)
129 cmp mlen, off
130 b.le 1f // mlen <= off: this mbuf is wholly consumed by the offset
131 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
132 sub mlen, mlen, off // mlen -= off;
133 add data, data, off // data = mtod(m, uint8_t *) + off;
134 b L_post_initial_offset
135 1:
136 sub off, off, mlen // off -= mlen;
137 cbnz len, 2f // still want data: advance to next mbuf
138 mov x0, x3 // len == 0: nothing to sum; return current sum unchanged
139 ret lr
140 2:
141 ldr m, [m, #M_NEXT] // m = m->m_next;
142 b 0b
143
144 L_loop: // for (; len > 0; m = m->m_next) {
145 /*
146 * if (PREDICT_FALSE(m == NULL)) {
147 * CKSUM_ERR("%s: out of data\n", __func__);
148 * return (-1);
149 * }
150 * mlen = m->m_len;
151 * data = mtod(m, uint8_t *);
152 */
153 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
154 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
155 ldr data, [m, #M_DATA] // mtod(m, uint8_t *)
156 
157 L_post_initial_offset:
158 /*
159 * if (mlen == 0) continue;
160 * if (mlen > len) mlen = len;
161 * len -= mlen;
162 */
163 
164 cbz mlen, L_continue // empty mbuf: skip to next in chain
165 cmp mlen, len
166 csel mlen, mlen, len, le // mlen = min(mlen, len)
167 sub len, len, mlen // len -= mlen;
168 
169 /*
170 * partial = 0;
171 * if ((uintptr_t)data & 1) {
172 * started_on_odd = !started_on_odd;
173 * partial = *data << 8;
174 * ++data;
175 * --mlen;
176 * }
177 * needs_swap = started_on_odd;
178 */
179 
180 tst data, #1
181 mov x7, #0 // x7 = partial accumulator
182 mov x10, #0 // x10 = constant zero, used for carry folds below
183 b.eq 1f // data already 2-byte aligned
184 ldrb w7, [data], #1
185 eor started_on_odd, started_on_odd, #1 // started_on_odd = !started_on_odd
186 sub mlen, mlen, #1
187 lsl w7, w7, #8 // partial = *data << 8 (odd-address start)
188 1:
189 
190 
191 /*
192 * if ((uintptr_t)data & 2) {
193 * if (mlen < 2)
194 * goto trailing_bytes;
195 * partial += *(uint16_t *)(void *)data;
196 * data += 2;
197 * mlen -= 2;
198 * }
199 */
200 tst data, #2
201 mov needs_swap, started_on_odd // needs_swap = started_on_odd;
202 b.eq 1f // data already 4-byte aligned
203 cmp mlen, #2
204 b.lt L_trailing_bytes // fewer than 2 bytes left: handle as trailing byte
205 ldrh w9, [data], #2
206 sub mlen, mlen, #2
207 add w7, w7, w9 // partial += *(uint16_t *)data
208 1:
209
210 /*
211 * while (mlen >= 64) {
212 * __builtin_prefetch(data + 32);
213 * __builtin_prefetch(data + 64);
214 * partial += *(uint32_t *)(void *)data;
215 * partial += *(uint32_t *)(void *)(data + 4);
216 * partial += *(uint32_t *)(void *)(data + 8);
217 * partial += *(uint32_t *)(void *)(data + 12);
218 * partial += *(uint32_t *)(void *)(data + 16);
219 * partial += *(uint32_t *)(void *)(data + 20);
220 * partial += *(uint32_t *)(void *)(data + 24);
221 * partial += *(uint32_t *)(void *)(data + 28);
222 * partial += *(uint32_t *)(void *)(data + 32);
223 * partial += *(uint32_t *)(void *)(data + 36);
224 * partial += *(uint32_t *)(void *)(data + 40);
225 * partial += *(uint32_t *)(void *)(data + 44);
226 * partial += *(uint32_t *)(void *)(data + 48);
227 * partial += *(uint32_t *)(void *)(data + 52);
228 * partial += *(uint32_t *)(void *)(data + 56);
229 * partial += *(uint32_t *)(void *)(data + 60);
230 * data += 64;
231 * mlen -= 64;
232 * // if (PREDICT_FALSE(partial & (3ULL << 62))) {
233 * // if (needs_swap)
234 * // partial = (partial << 8) +
235 * // (partial >> 56);
236 * // sum += (partial >> 32);
237 * // sum += (partial & 0xffffffff);
238 * // partial = 0;
239 * // }
240 * }
241 */
242 
243 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
// Note: mlen may go negative here and below; the tail code at L32_bytes
// relies on the low 6 bits being preserved (all decrements are multiples of 64).
244 subs mlen, mlen, #64
245 b.lt L32_bytes
246 
247 // save used vector registers
// NOTE(review): v0-v7 are caller-saved under AAPCS64; presumably spilled here
// because this code must not clobber the interrupted context's SIMD state
// when running in the kernel -- confirm.
248 sub sp, sp, #8*16
249 mov x11, sp
250 st1.4s {v0, v1, v2, v3}, [x11], #4*16
251 st1.4s {v4, v5, v6, v7}, [x11], #4*16
252 
253 // spread partial into 8 8-byte registers in v0-v3
254 fmov s3, w7 // v3 = {partial, 0, 0, 0}; fmov zeroes the upper lanes
255 eor.16b v0, v0, v0 // clear remaining accumulators
256 eor.16b v1, v1, v1
257 eor.16b v2, v2, v2
258 
259 // load the 1st 64 bytes (16 32-bit words)
260 ld1.4s {v4,v5,v6,v7},[data],#64
261 
262 // branch to finish off if mlen<64
263 subs mlen, mlen, #64
264 b.lt L64_finishup
265 
266 /*
267 * loop for loading and accumulating 16 32-bit words into
268 * 8 8-byte accumulators per iteration.
269 */
270 L64_loop:
271 subs mlen, mlen, #64 // mlen -= 64
272 
// uadalp.2d: add adjacent pairs of unsigned 32-bit lanes into the
// corresponding 64-bit accumulator lanes (no carry can be lost).
273 uadalp.2d v0, v4
274 ld1.4s {v4},[data], #16
275 
276 uadalp.2d v1, v5
277 ld1.4s {v5},[data], #16
278 
279 uadalp.2d v2, v6
280 ld1.4s {v6},[data], #16
281 
282 uadalp.2d v3, v7
283 ld1.4s {v7},[data], #16
284 
285 b.ge L64_loop
286 
287 L64_finishup:
// fold in the last 64 bytes already loaded in v4-v7
288 uadalp.2d v0, v4
289 uadalp.2d v1, v5
290 uadalp.2d v2, v6
291 uadalp.2d v3, v7
292 
// horizontal reduction of the 8 64-bit accumulator lanes into one scalar
293 add.2d v0, v0, v1
294 add.2d v2, v2, v3
295 addp.2d d0, v0 // d0 = v0[0] + v0[1]
296 addp.2d d2, v2 // d2 = v2[0] + v2[1]
297 add.2d v0, v0, v2 // lane 0 of v0 = d0 + d2 = grand total
298 fmov x7, d0 // partial in x7 now
299 
300 // restore used vector registers
301 ld1.4s {v0, v1, v2, v3}, [sp], #4*16
302 ld1.4s {v4, v5, v6, v7}, [sp], #4*16
303
304 L32_bytes:
// mlen is negative here (pre-decremented by multiples of 64 above), but its
// low 6 bits still equal the residual byte count, so tst on #32/#16/#8/#4/#2/#1
// selects each tail size correctly.
305 tst mlen, #32
306 b.eq L16_bytes
307 ldp x9, x10, [data], #16
308 ldp x11, x12, [data], #16
309 adds x7, x7, x9
310 mov x9, #0
311 adcs x7, x7, x10 // 64-bit one's-complement style: propagate carries
312 adcs x7, x7, x11
313 adcs x7, x7, x12
314 adc x7, x7, x9 // fold final carry back in (x9 == 0)
315 
316 L16_bytes:
317 tst mlen, #16
318 b.eq L8_bytes
319 ldp x9, x10, [data], #16
320 adds x7, x7, x9
321 mov x9, #0
322 adcs x7, x7, x10
323 adc x7, x7, x9 // fold final carry (x9 == 0)
324 
325 L8_bytes:
326 tst mlen, #8
327 mov x10, #0 // x10 = 0 for the carry folds below
328 b.eq L4_bytes
329 ldr x9,[data],#8
330 adds x7, x7, x9
331 adc x7, x7, x10 // fold carry
332 
333 L4_bytes:
334 tst mlen, #4
335 b.eq L2_bytes
336 ldr w9,[data],#4 // zero-extends into x9
337 adds x7, x7, x9
338 adc x7, x7, x10 // fold carry
339 
340 L2_bytes:
341 tst mlen, #2
342 b.eq L_trailing_bytes
343 ldrh w9,[data],#2
344 adds x7, x7, x9
345 adc x7, x7, x10 // fold carry
346 
347 L_trailing_bytes:
348 tst mlen, #1
349 b.eq L0_bytes
350 ldrb w9,[data],#1
351 adds x7, x7, x9
352 adc x7, x7, x10 // fold carry
353 eor started_on_odd, started_on_odd, #1 // odd byte count: flip parity for next mbuf
354
355 L0_bytes:
356 /*
357 * if (needs_swap)
358 * partial = (partial << 8) + (partial >> 56);
359 */
360 cbz needs_swap, 1f
361 ror x7, x7, #56 // rotate left by 8 == (partial << 8) | (partial >> 56)
362 1:
363 /*
364 * sum += (partial >> 32) + (partial & 0xffffffff);
365 * sum = (sum >> 32) + (sum & 0xffffffff);
366 * }
367 */
368 
369 add x3, x3, x7, lsr #32 // sum += partial >> 32
370 mov w7, w7 // partial &= 0xffffffff (clear high half)
371 add x3, x3, x7 // sum += partial & 0xffffffff
372 mov w7, w3 // w7 = sum & 0xffffffff
373 add x3, x7, x3, lsr #32 // sum = (sum >> 32) + (sum & 0xffffffff)
374 
375 L_continue:
376 cmp len, #0
377 ldr m, [m, #M_NEXT] // m = m->m_next (m is known non-NULL here)
378 b.gt L_loop // loop while len > 0
379 
380 /*
381 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
382 * ((sum >> 16) & 0xffff) + (sum & 0xffff);
383 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
384 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
385 * return (final_acc & 0xffff);
386 * }
387 */
388 
389 mov w4, #0x00ffff // 16-bit mask for the folds below
390 and x0, x4, x3, lsr #48 // sum >> 48
391 and x1, x4, x3, lsr #32 // (sum >> 32) & 0xffff
392 and x2, x4, x3, lsr #16 // (sum >> 16) & 0xffff
393 and x3, x4, x3 // sum & 0xffff
394 add w0, w0, w1
395 add w2, w2, w3
396 add w0, w0, w2 // final_acc = sum of the four 16-bit fields
397 and w1, w4, w0, lsr #16 // fold once ...
398 and w0, w4, w0
399 add w0, w0, w1
400 and w1, w4, w0, lsr #16 // ... and once more to absorb the last carry
401 and w0, w4, w0
402 add w0, w0, w1
403 /*
404 * If we were to 1's complement it (XOR with 0xffff):
405 *
406 * eor w0, w0, w4
407 */
408 and w0, w0, w4 // return final_acc & 0xffff
409 
410 ret lr
411
// Error path: mbuf chain exhausted before len bytes were checksummed.
412 Lin_cksum_whoops:
413 adrp x0, Lin_cksum_whoops_str@page // page-relative address of message
414 add x0, x0, Lin_cksum_whoops_str@pageoff
415 bl #CKSUM_ERR // log the error (kprintf / fprintf_stderr)
416 mov x0, #-1 // return (uint32_t)-1 to the caller
417 ret lr
418 
419 Lin_cksum_whoops_str:
420 .asciz "os_cpu_in_cksum_mbuf: out of data\n"
421 .align 5