bsd/net/pf_norm.c

   1 /*
   2  * Copyright (c) 2007-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 /*      $apfw: pf_norm.c,v 1.10 2008/08/28 19:10:53 jhw Exp $ */
  30 /*      $OpenBSD: pf_norm.c,v 1.107 2006/04/16 00:59:52 pascoe Exp $ */
  31
  32 /*
  33  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
  34  * All rights reserved.
  35  *
  36  * Redistribution and use in source and binary forms, with or without
  37  * modification, are permitted provided that the following conditions
  38  * are met:
  39  * 1. Redistributions of source code must retain the above copyright
  40  *    notice, this list of conditions and the following disclaimer.
  41  * 2. Redistributions in binary form must reproduce the above copyright
  42  *    notice, this list of conditions and the following disclaimer in the
  43  *    documentation and/or other materials provided with the distribution.
  44  *
  45  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  46  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  47  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  48  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  49  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  50  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  51  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  52  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  53  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  54  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  55  */
  56
  57 #include <sys/param.h>
  58 #include <sys/systm.h>
  59 #include <sys/mbuf.h>
  60 #include <sys/filio.h>
  61 #include <sys/fcntl.h>
  62 #include <sys/socket.h>
  63 #include <sys/kernel.h>
  64 #include <sys/time.h>
  65 #include <sys/random.h>
  66 #include <sys/mcache.h>
  67
  68 #include <net/if.h>
  69 #include <net/if_types.h>
  70 #include <net/bpf.h>
  71 #include <net/route.h>
  72 #include <net/if_pflog.h>
  73
  74 #include <netinet/in.h>
  75 #include <netinet/in_var.h>
  76 #include <netinet/in_systm.h>
  77 #include <netinet/ip.h>
  78 #include <netinet/ip_var.h>
  79 #include <netinet/tcp.h>
  80 #include <netinet/tcp_seq.h>
  81 #include <netinet/tcp_fsm.h>
  82 #include <netinet/udp.h>
  83 #include <netinet/ip_icmp.h>
  84
  85 #if INET6
  86 #include <netinet/ip6.h>
  87 #endif /* INET6 */
  88
  89 #include <net/pfvar.h>
  90
  91 struct pf_frent {
  92         LIST_ENTRY(pf_frent)    fr_next;
  93         struct mbuf             *fr_m;
  94 #define fr_ip           fr_u.fru_ipv4
  95 #define fr_ip6          fr_u.fru_ipv6
  96         union {
  97                 struct ip       *fru_ipv4;
  98                 struct ip6_hdr  *fru_ipv6;
  99         } fr_u;
 100         struct ip6_frag         fr_ip6f_opt;
 101         int                     fr_ip6f_hlen;
 102 };
 103
 104 struct pf_frcache {
 105         LIST_ENTRY(pf_frcache) fr_next;
 106         uint16_t        fr_off;
 107         uint16_t        fr_end;
 108 };
 109
 110 #define PFFRAG_SEENLAST 0x0001          /* Seen the last fragment for this */
 111 #define PFFRAG_NOBUFFER 0x0002          /* Non-buffering fragment cache */
 112 #define PFFRAG_DROP     0x0004          /* Drop all fragments */
 113 #define BUFFER_FRAGMENTS(fr)    (!((fr)->fr_flags & PFFRAG_NOBUFFER))
 114
 115 struct pf_fragment {
 116         RB_ENTRY(pf_fragment) fr_entry;
 117         TAILQ_ENTRY(pf_fragment) frag_next;
 118         struct pf_addr  fr_srcx;
 119         struct pf_addr  fr_dstx;
 120         u_int8_t        fr_p;           /* protocol of this fragment */
 121         u_int8_t        fr_flags;       /* status flags */
 122         u_int16_t       fr_max;         /* fragment data max */
 123 #define fr_id           fr_uid.fru_id4
 124 #define fr_id6          fr_uid.fru_id6
 125         union {
 126                 u_int16_t       fru_id4;
 127                 u_int32_t       fru_id6;
 128         } fr_uid;
 129         int             fr_af;
 130         u_int32_t       fr_timeout;
 131 #define fr_queue        fr_u.fru_queue
 132 #define fr_cache        fr_u.fru_cache
 133         union {
 134                 LIST_HEAD(pf_fragq, pf_frent) fru_queue;        /* buffering */
 135                 LIST_HEAD(pf_cacheq, pf_frcache) fru_cache;     /* non-buf */
 136         } fr_u;
 137         uint32_t        fr_csum_flags;  /* checksum flags */
 138         uint32_t        fr_csum;        /* partial checksum value */
 139 };
 140
 141 static TAILQ_HEAD(pf_fragqueue, pf_fragment)    pf_fragqueue;
 142 static TAILQ_HEAD(pf_cachequeue, pf_fragment)   pf_cachequeue;
 143
 144 static __inline int  pf_frag_compare(struct pf_fragment *,
 145     struct pf_fragment *);
 146 static RB_HEAD(pf_frag_tree, pf_fragment)       pf_frag_tree, pf_cache_tree;
 147 RB_PROTOTYPE_SC(__private_extern__, pf_frag_tree, pf_fragment, fr_entry,
 148     pf_frag_compare);
 149 RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
 150
 151 /* Private prototypes */
 152 static void pf_ip6hdr2key(struct pf_fragment *, struct ip6_hdr *,
 153     struct ip6_frag *);
 154 static void pf_ip2key(struct pf_fragment *, struct ip *);
 155 static void pf_remove_fragment(struct pf_fragment *);
 156 static void pf_flush_fragments(void);
 157 static void pf_free_fragment(struct pf_fragment *);
 158 static struct pf_fragment *pf_find_fragment_by_key(struct pf_fragment *,
 159     struct pf_frag_tree *);
 160 static __inline struct pf_fragment *
 161     pf_find_fragment_by_ipv4_header(struct ip *, struct pf_frag_tree *);
 162 static __inline struct pf_fragment *
 163     pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *,
 164     struct pf_frag_tree *);
 165 static struct mbuf *pf_reassemble(struct mbuf *, struct pf_fragment **,
 166     struct pf_frent *, int);
 167 static struct mbuf *pf_fragcache(struct mbuf **, struct ip *,
 168     struct pf_fragment **, int, int, int *);
 169 static struct mbuf *pf_reassemble6(struct mbuf **, struct pf_fragment **,
 170     struct pf_frent *, int);
 171 static struct mbuf *pf_frag6cache(struct mbuf **, struct ip6_hdr*,
 172     struct ip6_frag *, struct pf_fragment **, int, int, int, int *);
 173 static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *,
 174     struct pf_pdesc *, pbuf_t *, struct tcphdr *, int, int *);
 175
 176 #define DPFPRINTF(x) do {                               \
 177         if (pf_status.debug >= PF_DEBUG_MISC) {         \
 178                 printf("%s: ", __func__);               \
 179                 printf x ;                              \
 180         }                                               \
 181 } while (0)
 182
 183 /* Globals */
 184 struct pool              pf_frent_pl, pf_frag_pl;
 185 static struct pool       pf_cache_pl, pf_cent_pl;
 186 struct pool              pf_state_scrub_pl;
 187
 188 static int               pf_nfrents, pf_ncache;
 189
 190 void
 191 pf_normalize_init(void)
 192 {
 193         pool_init(&pf_frent_pl, sizeof (struct pf_frent), 0, 0, 0, "pffrent",
 194             NULL);
 195         pool_init(&pf_frag_pl, sizeof (struct pf_fragment), 0, 0, 0, "pffrag",
 196             NULL);
 197         pool_init(&pf_cache_pl, sizeof (struct pf_fragment), 0, 0, 0,
 198             "pffrcache", NULL);
 199         pool_init(&pf_cent_pl, sizeof (struct pf_frcache), 0, 0, 0, "pffrcent",
 200             NULL);
 201         pool_init(&pf_state_scrub_pl, sizeof (struct pf_state_scrub), 0, 0, 0,
 202             "pfstscr", NULL);
 203
 204         pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
 205         pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
 206         pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
 207         pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);
 208
 209         TAILQ_INIT(&pf_fragqueue);
 210         TAILQ_INIT(&pf_cachequeue);
 211 }
 212
 213 #if 0
 214 void
 215 pf_normalize_destroy(void)
 216 {
 217         pool_destroy(&pf_state_scrub_pl);
 218         pool_destroy(&pf_cent_pl);
 219         pool_destroy(&pf_cache_pl);
 220         pool_destroy(&pf_frag_pl);
 221         pool_destroy(&pf_frent_pl);
 222 }
 223 #endif
 224
 225 int
 226 pf_normalize_isempty(void)
 227 {
 228         return (TAILQ_EMPTY(&pf_fragqueue) && TAILQ_EMPTY(&pf_cachequeue));
 229 }
 230
 231 static __inline int
 232 pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
 233 {
 234         int     diff;
 235
 236         if ((diff = a->fr_af - b->fr_af))
 237                 return (diff);
 238         else if ((diff = a->fr_p - b->fr_p))
 239                 return (diff);
 240         else {
 241                 struct pf_addr *sa = &a->fr_srcx;
 242                 struct pf_addr *sb = &b->fr_srcx;
 243                 struct pf_addr *da = &a->fr_dstx;
 244                 struct pf_addr *db = &b->fr_dstx;
 245
 246                 switch (a->fr_af) {
 247 #ifdef INET
 248                 case AF_INET:
 249                         if ((diff = a->fr_id - b->fr_id))
 250                                 return (diff);
 251                         else if (sa->v4addr.s_addr < sb->v4addr.s_addr)
 252                                 return (-1);
 253                         else if (sa->v4addr.s_addr > sb->v4addr.s_addr)
 254                                 return (1);
 255                         else if (da->v4addr.s_addr < db->v4addr.s_addr)
 256                                 return (-1);
 257                         else if (da->v4addr.s_addr > db->v4addr.s_addr)
 258                                 return (1);
 259                         break;
 260 #endif
 261 #ifdef INET6
 262                 case AF_INET6:
 263                         if ((diff = a->fr_id6 - b->fr_id6))
 264                                 return (diff);
 265                         else if (sa->addr32[3] < sb->addr32[3])
 266                                 return (-1);
 267                         else if (sa->addr32[3] > sb->addr32[3])
 268                                 return (1);
 269                         else if (sa->addr32[2] < sb->addr32[2])
 270                                 return (-1);
 271                         else if (sa->addr32[2] > sb->addr32[2])
 272                                 return (1);
 273                         else if (sa->addr32[1] < sb->addr32[1])
 274                                 return (-1);
 275                         else if (sa->addr32[1] > sb->addr32[1])
 276                                 return (1);
 277                         else if (sa->addr32[0] < sb->addr32[0])
 278                                 return (-1);
 279                         else if (sa->addr32[0] > sb->addr32[0])
 280                                 return (1);
 281                         else if (da->addr32[3] < db->addr32[3])
 282                                 return (-1);
 283                         else if (da->addr32[3] > db->addr32[3])
 284                                 return (1);
 285                         else if (da->addr32[2] < db->addr32[2])
 286                                 return (-1);
 287                         else if (da->addr32[2] > db->addr32[2])
 288                                 return (1);
 289                         else if (da->addr32[1] < db->addr32[1])
 290                                 return (-1);
 291                         else if (da->addr32[1] > db->addr32[1])
 292                                 return (1);
 293                         else if (da->addr32[0] < db->addr32[0])
 294                                 return (-1);
 295                         else if (da->addr32[0] > db->addr32[0])
 296                                 return (1);
 297                         break;
 298 #endif
 299                 default:
 300                         VERIFY(!0 && "only IPv4 and IPv6 supported!");
 301                         break;
 302                 }
 303         }
 304         return (0);
 305 }
 306
 307 void
 308 pf_purge_expired_fragments(void)
 309 {
 310         struct pf_fragment *frag;
 311         u_int32_t expire = pf_time_second() -
 312             pf_default_rule.timeout[PFTM_FRAG];
 313
 314         while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
 315                 VERIFY(BUFFER_FRAGMENTS(frag));
 316                 if (frag->fr_timeout > expire)
 317                         break;
 318
 319                 switch (frag->fr_af) {
 320                 case AF_INET:
 321                       DPFPRINTF(("expiring IPv4 %d(0x%llx) from queue.\n",
 322                           ntohs(frag->fr_id),
 323                           (uint64_t)VM_KERNEL_ADDRPERM(frag)));
 324                       break;
 325                 case AF_INET6:
 326                       DPFPRINTF(("expiring IPv6 %d(0x%llx) from queue.\n",
 327                           ntohl(frag->fr_id6),
 328                           (uint64_t)VM_KERNEL_ADDRPERM(frag)));
 329                       break;
 330                 default:
 331                       VERIFY(0 && "only IPv4 and IPv6 supported");
 332                       break;
 333                 }
 334                 pf_free_fragment(frag);
 335         }
 336
 337         while ((frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue)) != NULL) {
 338                 VERIFY(!BUFFER_FRAGMENTS(frag));
 339                 if (frag->fr_timeout > expire)
 340                         break;
 341
 342                 switch (frag->fr_af) {
 343                 case AF_INET:
 344                       DPFPRINTF(("expiring IPv4 %d(0x%llx) from cache.\n",
 345                           ntohs(frag->fr_id),
 346                           (uint64_t)VM_KERNEL_ADDRPERM(frag)));
 347                       break;
 348                 case AF_INET6:
 349                       DPFPRINTF(("expiring IPv6 %d(0x%llx) from cache.\n",
 350                           ntohl(frag->fr_id6),
 351                           (uint64_t)VM_KERNEL_ADDRPERM(frag)));
 352                       break;
 353                 default:
 354                       VERIFY(0 && "only IPv4 and IPv6 supported");
 355                       break;
 356                 }
 357                 pf_free_fragment(frag);
 358                 VERIFY(TAILQ_EMPTY(&pf_cachequeue) ||
 359                     TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag);
 360         }
 361 }
 362
 363 /*
 364  * Try to flush old fragments to make space for new ones
 365  */
 366
 367 static void
 368 pf_flush_fragments(void)
 369 {
 370         struct pf_fragment      *frag;
 371         int                      goal;
 372
 373         goal = pf_nfrents * 9 / 10;
 374         DPFPRINTF(("trying to free > %d frents\n",
 375             pf_nfrents - goal));
 376         while (goal < pf_nfrents) {
 377                 frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue);
 378                 if (frag == NULL)
 379                         break;
 380                 pf_free_fragment(frag);
 381         }
 382
 383
 384         goal = pf_ncache * 9 / 10;
 385         DPFPRINTF(("trying to free > %d cache entries\n",
 386             pf_ncache - goal));
 387         while (goal < pf_ncache) {
 388                 frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue);
 389                 if (frag == NULL)
 390                         break;
 391                 pf_free_fragment(frag);
 392         }
 393 }
 394
 395 /* Frees the fragments and all associated entries */
 396
 397 static void
 398 pf_free_fragment(struct pf_fragment *frag)
 399 {
 400         struct pf_frent         *frent;
 401         struct pf_frcache       *frcache;
 402
 403         /* Free all fragments */
 404         if (BUFFER_FRAGMENTS(frag)) {
 405                 for (frent = LIST_FIRST(&frag->fr_queue); frent;
 406                     frent = LIST_FIRST(&frag->fr_queue)) {
 407                         LIST_REMOVE(frent, fr_next);
 408
 409                         m_freem(frent->fr_m);
 410                         pool_put(&pf_frent_pl, frent);
 411                         pf_nfrents--;
 412                 }
 413         } else {
 414                 for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
 415                     frcache = LIST_FIRST(&frag->fr_cache)) {
 416                         LIST_REMOVE(frcache, fr_next);
 417
 418                         VERIFY(LIST_EMPTY(&frag->fr_cache) ||
 419                             LIST_FIRST(&frag->fr_cache)->fr_off >
 420                             frcache->fr_end);
 421
 422                         pool_put(&pf_cent_pl, frcache);
 423                         pf_ncache--;
 424                 }
 425         }
 426
 427         pf_remove_fragment(frag);
 428 }
 429
 430 static void
 431 pf_ip6hdr2key(struct pf_fragment *key, struct ip6_hdr *ip6,
 432     struct ip6_frag *fh)
 433 {
 434         key->fr_p = fh->ip6f_nxt;
 435         key->fr_id6 = fh->ip6f_ident;
 436         key->fr_af = AF_INET6;
 437         key->fr_srcx.v6addr = ip6->ip6_src;
 438         key->fr_dstx.v6addr = ip6->ip6_dst;
 439 }
 440
 441 static void
 442 pf_ip2key(struct pf_fragment *key, struct ip *ip)
 443 {
 444         key->fr_p = ip->ip_p;
 445         key->fr_id = ip->ip_id;
 446         key->fr_af = AF_INET;
 447         key->fr_srcx.v4addr.s_addr = ip->ip_src.s_addr;
 448         key->fr_dstx.v4addr.s_addr = ip->ip_dst.s_addr;
 449 }
 450
 451 static struct pf_fragment *
 452 pf_find_fragment_by_key(struct pf_fragment *key, struct pf_frag_tree *tree)
 453 {
 454         struct pf_fragment *frag;
 455
 456         frag = RB_FIND(pf_frag_tree, tree, key);
 457         if (frag != NULL) {
 458                 /* XXX Are we sure we want to update the timeout? */
 459                 frag->fr_timeout = pf_time_second();
 460                 if (BUFFER_FRAGMENTS(frag)) {
 461                         TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
 462                         TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
 463                 } else {
 464                         TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
 465                         TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next);
 466                 }
 467         }
 468
 469         return (frag);
 470 }
 471
 472 static __inline struct pf_fragment *
 473 pf_find_fragment_by_ipv4_header(struct ip *ip, struct pf_frag_tree *tree)
 474 {
 475         struct pf_fragment key;
 476         pf_ip2key(&key, ip);
 477         return pf_find_fragment_by_key(&key, tree);
 478 }
 479
 480 static __inline struct pf_fragment *
 481 pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh,
 482     struct pf_frag_tree *tree)
 483 {
 484       struct pf_fragment key;
 485       pf_ip6hdr2key(&key, ip6, fh);
 486       return pf_find_fragment_by_key(&key, tree);
 487 }
 488
 489 /* Removes a fragment from the fragment queue and frees the fragment */
 490
 491 static void
 492 pf_remove_fragment(struct pf_fragment *frag)
 493 {
 494         if (BUFFER_FRAGMENTS(frag)) {
 495                 RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag);
 496                 TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
 497                 pool_put(&pf_frag_pl, frag);
 498         } else {
 499                 RB_REMOVE(pf_frag_tree, &pf_cache_tree, frag);
 500                 TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
 501                 pool_put(&pf_cache_pl, frag);
 502         }
 503 }
 504
 505 #define FR_IP_OFF(fr)   ((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
 506 static struct mbuf *
 507 pf_reassemble(struct mbuf *m0, struct pf_fragment **frag,
 508     struct pf_frent *frent, int mff)
 509 {
 510         struct mbuf     *m = m0, *m2;
 511         struct pf_frent *frea, *next;
 512         struct pf_frent *frep = NULL;
 513         struct ip       *ip = frent->fr_ip;
 514         uint32_t         hlen = ip->ip_hl << 2;
 515         u_int16_t        off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
 516         u_int16_t        ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4;
 517         u_int16_t        fr_max = ip_len + off;
 518         uint32_t         csum, csum_flags;
 519
 520         VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag));
 521
 522         /*
 523          * Leverage partial checksum offload for IP fragments.  Narrow down
 524          * the scope to cover only UDP without IP options, as that is the
 525          * most common case.
 526          *
 527          * Perform 1's complement adjustment of octets that got included/
 528          * excluded in the hardware-calculated checksum value.  Ignore cases
 529          * where the value includes the entire IPv4 header span, as the sum
 530          * for those octets would already be 0 by the time we get here; IP
 531          * has already performed its header checksum validation.  Also take
 532          * care of any trailing bytes and subtract out their partial sum.
 533          */
 534         if (ip->ip_p == IPPROTO_UDP && hlen == sizeof (struct ip) &&
 535             (m->m_pkthdr.csum_flags &
 536             (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
 537             (CSUM_DATA_VALID | CSUM_PARTIAL)) {
 538                 uint32_t start = m->m_pkthdr.csum_rx_start;
 539                 int32_t trailer = (m_pktlen(m) - ntohs(ip->ip_len));
 540                 uint32_t swbytes = (uint32_t)trailer;
 541
 542                 csum = m->m_pkthdr.csum_rx_val;
 543
 544                 ASSERT(trailer >= 0);
 545                 if ((start != 0 && start != hlen) || trailer != 0) {
 546 #if BYTE_ORDER != BIG_ENDIAN
 547                         if (start < hlen) {
 548                                 HTONS(ip->ip_len);
 549                                 HTONS(ip->ip_off);
 550                         }
 551 #endif /* BYTE_ORDER != BIG_ENDIAN */
 552                         /* callee folds in sum */
 553                         csum = m_adj_sum16(m, start, hlen,
 554                             (ip->ip_len - hlen), csum);
 555                         if (hlen > start)
 556                                 swbytes += (hlen - start);
 557                         else
 558                                 swbytes += (start - hlen);
 559 #if BYTE_ORDER != BIG_ENDIAN
 560                         if (start < hlen) {
 561                                 NTOHS(ip->ip_off);
 562                                 NTOHS(ip->ip_len);
 563                         }
 564 #endif /* BYTE_ORDER != BIG_ENDIAN */
 565                 }
 566                 csum_flags = m->m_pkthdr.csum_flags;
 567
 568                 if (swbytes != 0)
 569                         udp_in_cksum_stats(swbytes);
 570                 if (trailer != 0)
 571                         m_adj(m, -trailer);
 572         } else {
 573                 csum = 0;
 574                 csum_flags = 0;
 575         }
 576
 577         /* Invalidate checksum */
 578         m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
 579
 580         /* Strip off ip header */
 581         m->m_data += hlen;
 582         m->m_len -= hlen;
 583
 584         /* Create a new reassembly queue for this packet */
 585         if (*frag == NULL) {
 586                 *frag = pool_get(&pf_frag_pl, PR_NOWAIT);
 587                 if (*frag == NULL) {
 588                         pf_flush_fragments();
 589                         *frag = pool_get(&pf_frag_pl, PR_NOWAIT);
 590                         if (*frag == NULL)
 591                                 goto drop_fragment;
 592                 }
 593
 594                 (*frag)->fr_flags = 0;
 595                 (*frag)->fr_max = 0;
 596                 (*frag)->fr_af = AF_INET;
 597                 (*frag)->fr_srcx.v4addr = frent->fr_ip->ip_src;
 598                 (*frag)->fr_dstx.v4addr = frent->fr_ip->ip_dst;
 599                 (*frag)->fr_p = frent->fr_ip->ip_p;
 600                 (*frag)->fr_id = frent->fr_ip->ip_id;
 601                 (*frag)->fr_timeout = pf_time_second();
 602                 if (csum_flags != 0) {
 603                         (*frag)->fr_csum_flags = csum_flags;
 604                         (*frag)->fr_csum = csum;
 605                 }
 606                 LIST_INIT(&(*frag)->fr_queue);
 607
 608                 RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
 609                 TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);
 610
 611                 /* We do not have a previous fragment */
 612                 frep = NULL;
 613                 goto insert;
 614         }
 615
 616         /*
 617          * If this fragment contains similar checksum offload info
 618          * as that of the existing ones, accumulate checksum.  Otherwise,
 619          * invalidate checksum offload info for the entire datagram.
 620          */
 621         if (csum_flags != 0 && csum_flags == (*frag)->fr_csum_flags)
 622                 (*frag)->fr_csum += csum;
 623         else if ((*frag)->fr_csum_flags != 0)
 624                 (*frag)->fr_csum_flags = 0;
 625
 626         /*
 627          * Find a fragment after the current one:
 628          *  - off contains the real shifted offset.
 629          */
 630         LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
 631                 if (FR_IP_OFF(frea) > off)
 632                         break;
 633                 frep = frea;
 634         }
 635
 636         VERIFY(frep != NULL || frea != NULL);
 637
 638         if (frep != NULL &&
 639             FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl *
 640             4 > off) {
 641                 u_int16_t       precut;
 642
 643                 precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
 644                     frep->fr_ip->ip_hl * 4 - off;
 645                 if (precut >= ip_len)
 646                         goto drop_fragment;
 647                 m_adj(frent->fr_m, precut);
 648                 DPFPRINTF(("overlap -%d\n", precut));
 649                 /* Enforce 8 byte boundaries */
 650                 ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
 651                 off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
 652                 ip_len -= precut;
 653                 ip->ip_len = htons(ip_len);
 654         }
 655
 656         for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
 657             frea = next) {
 658                 u_int16_t       aftercut;
 659
 660                 aftercut = ip_len + off - FR_IP_OFF(frea);
 661                 DPFPRINTF(("adjust overlap %d\n", aftercut));
 662                 if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl
 663                     * 4) {
 664                         frea->fr_ip->ip_len =
 665                             htons(ntohs(frea->fr_ip->ip_len) - aftercut);
 666                         frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) +
 667                             (aftercut >> 3));
 668                         m_adj(frea->fr_m, aftercut);
 669                         break;
 670                 }
 671
 672                 /* This fragment is completely overlapped, lose it */
 673                 next = LIST_NEXT(frea, fr_next);
 674                 m_freem(frea->fr_m);
 675                 LIST_REMOVE(frea, fr_next);
 676                 pool_put(&pf_frent_pl, frea);
 677                 pf_nfrents--;
 678         }
 679
 680 insert:
 681         /* Update maximum data size */
 682         if ((*frag)->fr_max < fr_max)
 683                 (*frag)->fr_max = fr_max;
 684         /* This is the last segment */
 685         if (!mff)
 686                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
 687
 688         if (frep == NULL)
 689                 LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
 690         else
 691                 LIST_INSERT_AFTER(frep, frent, fr_next);
 692
 693         /* Check if we are completely reassembled */
 694         if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
 695                 return (NULL);
 696
 697         /* Check if we have all the data */
 698         off = 0;
 699         for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
 700                 next = LIST_NEXT(frep, fr_next);
 701
 702                 off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
 703                 if (off < (*frag)->fr_max &&
 704                     (next == NULL || FR_IP_OFF(next) != off)) {
 705                         DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
 706                             off, next == NULL ? -1 : FR_IP_OFF(next),
 707                             (*frag)->fr_max));
 708                         return (NULL);
 709                 }
 710         }
 711         DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
 712         if (off < (*frag)->fr_max)
 713                 return (NULL);
 714
 715         /* We have all the data */
 716         frent = LIST_FIRST(&(*frag)->fr_queue);
 717         VERIFY(frent != NULL);
 718         if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
 719                 DPFPRINTF(("drop: too big: %d\n", off));
 720                 pf_free_fragment(*frag);
 721                 *frag = NULL;
 722                 return (NULL);
 723         }
 724         next = LIST_NEXT(frent, fr_next);
 725
 726         /* Magic from ip_input */
 727         ip = frent->fr_ip;
 728         m = frent->fr_m;
 729         m2 = m->m_next;
 730         m->m_next = NULL;
 731         m_cat(m, m2);
 732         pool_put(&pf_frent_pl, frent);
 733         pf_nfrents--;
 734         for (frent = next; frent != NULL; frent = next) {
 735                 next = LIST_NEXT(frent, fr_next);
 736
 737                 m2 = frent->fr_m;
 738                 pool_put(&pf_frent_pl, frent);
 739                 pf_nfrents--;
 740                 m_cat(m, m2);
 741         }
 742
 743         ip->ip_src = (*frag)->fr_srcx.v4addr;
 744         ip->ip_dst = (*frag)->fr_dstx.v4addr;
 745
 746         if ((*frag)->fr_csum_flags != 0) {
 747                 csum = (*frag)->fr_csum;
 748
 749                 ADDCARRY(csum);
 750
 751                 m->m_pkthdr.csum_rx_val = csum;
 752                 m->m_pkthdr.csum_rx_start = sizeof (struct ip);
 753                 m->m_pkthdr.csum_flags = (*frag)->fr_csum_flags;
 754         } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
 755             (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
 756                 /* loopback checksums are always OK */
 757                 m->m_pkthdr.csum_data = 0xffff;
 758                 m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
 759                 m->m_pkthdr.csum_flags =
 760                     CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
 761                     CSUM_IP_CHECKED | CSUM_IP_VALID;
 762         }
 763
 764         /* Remove from fragment queue */
 765         pf_remove_fragment(*frag);
 766         *frag = NULL;
 767
 768         hlen = ip->ip_hl << 2;
 769         ip->ip_len = htons(off + hlen);
 770         m->m_len += hlen;
 771         m->m_data -= hlen;
 772
 773         /* some debugging cruft by sklower, below, will go away soon */
 774         /* XXX this should be done elsewhere */
 775         if (m->m_flags & M_PKTHDR) {
 776                 int plen = 0;
 777                 for (m2 = m; m2; m2 = m2->m_next)
 778                         plen += m2->m_len;
 779                 m->m_pkthdr.len = plen;
 780         }
 781
 782         DPFPRINTF(("complete: 0x%llx(%d)\n",
 783             (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip->ip_len)));
 784         return (m);
 785
 786 drop_fragment:
 787         /* Oops - fail safe - drop packet */
 788         pool_put(&pf_frent_pl, frent);
 789         pf_nfrents--;
 790         m_freem(m);
 791         return (NULL);
 792 }
 793
 794 static struct mbuf *
 795 pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
 796     int drop, int *nomem)
 797 {
 798         struct mbuf             *m = *m0;
 799         struct pf_frcache       *frp, *fra, *cur = NULL;
 800         int                      ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
 801         u_int16_t                off = ntohs(h->ip_off) << 3;
 802         u_int16_t                fr_max = ip_len + off;
 803         int                      hosed = 0;
 804
 805         VERIFY(*frag == NULL || !BUFFER_FRAGMENTS(*frag));
 806
 807         /* Create a new range queue for this packet */
 808         if (*frag == NULL) {
 809                 *frag = pool_get(&pf_cache_pl, PR_NOWAIT);
 810                 if (*frag == NULL) {
 811                         pf_flush_fragments();
 812                         *frag = pool_get(&pf_cache_pl, PR_NOWAIT);
 813                         if (*frag == NULL)
 814                                 goto no_mem;
 815                 }
 816
 817                 /* Get an entry for the queue */
 818                 cur = pool_get(&pf_cent_pl, PR_NOWAIT);
 819                 if (cur == NULL) {
 820                         pool_put(&pf_cache_pl, *frag);
 821                         *frag = NULL;
 822                         goto no_mem;
 823                 }
 824                 pf_ncache++;
 825
 826                 (*frag)->fr_flags = PFFRAG_NOBUFFER;
 827                 (*frag)->fr_max = 0;
 828                 (*frag)->fr_af = AF_INET;
 829                 (*frag)->fr_srcx.v4addr = h->ip_src;
 830                 (*frag)->fr_dstx.v4addr = h->ip_dst;
 831                 (*frag)->fr_p = h->ip_p;
 832                 (*frag)->fr_id = h->ip_id;
 833                 (*frag)->fr_timeout = pf_time_second();
 834
 835                 cur->fr_off = off;
 836                 cur->fr_end = fr_max;
 837                 LIST_INIT(&(*frag)->fr_cache);
 838                 LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);
 839
 840                 RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
 841                 TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);
 842
 843                 DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off,
 844                     fr_max));
 845
 846                 goto pass;
 847         }
 848
 849         /*
 850          * Find a fragment after the current one:
 851          *  - off contains the real shifted offset.
 852          */
 853         frp = NULL;
 854         LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
 855                 if (fra->fr_off > off)
 856                         break;
 857                 frp = fra;
 858         }
 859
 860         VERIFY(frp != NULL || fra != NULL);
 861
 862         if (frp != NULL) {
 863                 int     precut;
 864
 865                 precut = frp->fr_end - off;
 866                 if (precut >= ip_len) {
 867                         /* Fragment is entirely a duplicate */
 868                         DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
 869                             h->ip_id, frp->fr_off, frp->fr_end, off, fr_max));
 870                         goto drop_fragment;
 871                 }
 872                 if (precut == 0) {
 873                         /* They are adjacent.  Fixup cache entry */
 874                         DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
 875                             h->ip_id, frp->fr_off, frp->fr_end, off, fr_max));
 876                         frp->fr_end = fr_max;
 877                 } else if (precut > 0) {
 878                         /*
 879                          * The first part of this payload overlaps with a
 880                          * fragment that has already been passed.
 881                          * Need to trim off the first part of the payload.
 882                          * But to do so easily, we need to create another
 883                          * mbuf to throw the original header into.
 884                          */
 885
 886                         DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
 887                             h->ip_id, precut, frp->fr_off, frp->fr_end, off,
 888                             fr_max));
 889
 890                         off += precut;
 891                         fr_max -= precut;
 892                         /* Update the previous frag to encompass this one */
 893                         frp->fr_end = fr_max;
 894
 895                         if (!drop) {
 896                                 /*
 897                                  * XXX Optimization opportunity
 898                                  * This is a very heavy way to trim the payload.
 899                                  * we could do it much faster by diddling mbuf
 900                                  * internals but that would be even less legible
 901                                  * than this mbuf magic.  For my next trick,
 902                                  * I'll pull a rabbit out of my laptop.
 903                                  */
 904                                 *m0 = m_copym(m, 0, h->ip_hl << 2, M_NOWAIT);
 905                                 if (*m0 == NULL)
 906                                         goto no_mem;
 907                                 VERIFY((*m0)->m_next == NULL);
 908                                 m_adj(m, precut + (h->ip_hl << 2));
 909                                 m_cat(*m0, m);
 910                                 m = *m0;
 911                                 if (m->m_flags & M_PKTHDR) {
 912                                         int plen = 0;
 913                                         struct mbuf *t;
 914                                         for (t = m; t; t = t->m_next)
 915                                                 plen += t->m_len;
 916                                         m->m_pkthdr.len = plen;
 917                                 }
 918
 919
 920                                 h = mtod(m, struct ip *);
 921
 922
 923                                 VERIFY((int)m->m_len ==
 924                                     ntohs(h->ip_len) - precut);
 925                                 h->ip_off = htons(ntohs(h->ip_off) +
 926                                     (precut >> 3));
 927                                 h->ip_len = htons(ntohs(h->ip_len) - precut);
 928                         } else {
 929                                 hosed++;
 930                         }
 931                 } else {
 932                         /* There is a gap between fragments */
 933
 934                         DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
 935                             h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
 936                             fr_max));
 937
 938                         cur = pool_get(&pf_cent_pl, PR_NOWAIT);
 939                         if (cur == NULL)
 940                                 goto no_mem;
 941                         pf_ncache++;
 942
 943                         cur->fr_off = off;
 944                         cur->fr_end = fr_max;
 945                         LIST_INSERT_AFTER(frp, cur, fr_next);
 946                 }
 947         }
 948
 949         if (fra != NULL) {
 950                 int     aftercut;
 951                 int     merge = 0;
 952
 953                 aftercut = fr_max - fra->fr_off;
 954                 if (aftercut == 0) {
 955                         /* Adjacent fragments */
 956                         DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
 957                             h->ip_id, off, fr_max, fra->fr_off, fra->fr_end));
 958                         fra->fr_off = off;
 959                         merge = 1;
 960                 } else if (aftercut > 0) {
 961                         /* Need to chop off the tail of this fragment */
 962                         DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
 963                             h->ip_id, aftercut, off, fr_max, fra->fr_off,
 964                             fra->fr_end));
 965                         fra->fr_off = off;
 966                         fr_max -= aftercut;
 967
 968                         merge = 1;
 969
 970                         if (!drop) {
 971                                 m_adj(m, -aftercut);
 972                                 if (m->m_flags & M_PKTHDR) {
 973                                         int plen = 0;
 974                                         struct mbuf *t;
 975                                         for (t = m; t; t = t->m_next)
 976                                                 plen += t->m_len;
 977                                         m->m_pkthdr.len = plen;
 978                                 }
 979                                 h = mtod(m, struct ip *);
 980                                 VERIFY((int)m->m_len ==
 981                                     ntohs(h->ip_len) - aftercut);
 982                                 h->ip_len = htons(ntohs(h->ip_len) - aftercut);
 983                         } else {
 984                                 hosed++;
 985                         }
 986                 } else if (frp == NULL) {
 987                         /* There is a gap between fragments */
 988                         DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
 989                             h->ip_id, -aftercut, off, fr_max, fra->fr_off,
 990                             fra->fr_end));
 991
 992                         cur = pool_get(&pf_cent_pl, PR_NOWAIT);
 993                         if (cur == NULL)
 994                                 goto no_mem;
 995                         pf_ncache++;
 996
 997                         cur->fr_off = off;
 998                         cur->fr_end = fr_max;
 999                         LIST_INSERT_BEFORE(fra, cur, fr_next);
1000                 }
1001
1002
1003                 /* Need to glue together two separate fragment descriptors */
1004                 if (merge) {
1005                         if (cur && fra->fr_off <= cur->fr_end) {
1006                                 /* Need to merge in a previous 'cur' */
1007                                 DPFPRINTF(("fragcache[%d]: adjacent(merge "
1008                                     "%d-%d) %d-%d (%d-%d)\n",
1009                                     h->ip_id, cur->fr_off, cur->fr_end, off,
1010                                     fr_max, fra->fr_off, fra->fr_end));
1011                                 fra->fr_off = cur->fr_off;
1012                                 LIST_REMOVE(cur, fr_next);
1013                                 pool_put(&pf_cent_pl, cur);
1014                                 pf_ncache--;
1015                                 cur = NULL;
1016
1017                         } else if (frp && fra->fr_off <= frp->fr_end) {
1018                                 /* Need to merge in a modified 'frp' */
1019                                 VERIFY(cur == NULL);
1020                                 DPFPRINTF(("fragcache[%d]: adjacent(merge "
1021                                     "%d-%d) %d-%d (%d-%d)\n",
1022                                     h->ip_id, frp->fr_off, frp->fr_end, off,
1023                                     fr_max, fra->fr_off, fra->fr_end));
1024                                 fra->fr_off = frp->fr_off;
1025                                 LIST_REMOVE(frp, fr_next);
1026                                 pool_put(&pf_cent_pl, frp);
1027                                 pf_ncache--;
1028                                 frp = NULL;
1029
1030                         }
1031                 }
1032         }
1033
1034         if (hosed) {
1035                 /*
1036                  * We must keep tracking the overall fragment even when
1037                  * we're going to drop it anyway so that we know when to
1038                  * free the overall descriptor.  Thus we drop the frag late.
1039                  */
1040                 goto drop_fragment;
1041         }
1042
1043
1044 pass:
1045         /* Update maximum data size */
1046         if ((*frag)->fr_max < fr_max)
1047                 (*frag)->fr_max = fr_max;
1048
1049         /* This is the last segment */
1050         if (!mff)
1051                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
1052
1053         /* Check if we are completely reassembled */
1054         if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
1055             LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
1056             LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
1057                 /* Remove from fragment queue */
1058                 DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
1059                     (*frag)->fr_max));
1060                 pf_free_fragment(*frag);
1061                 *frag = NULL;
1062         }
1063
1064         return (m);
1065
1066 no_mem:
1067         *nomem = 1;
1068
1069         /* Still need to pay attention to !IP_MF */
1070         if (!mff && *frag != NULL)
1071                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
1072
1073         m_freem(m);
1074         return (NULL);
1075
1076 drop_fragment:
1077
1078         /* Still need to pay attention to !IP_MF */
1079         if (!mff && *frag != NULL)
1080                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
1081
1082         if (drop) {
1083                 /* This fragment has been deemed bad.  Don't reass */
1084                 if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
1085                         DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
1086                             h->ip_id));
1087                 (*frag)->fr_flags |= PFFRAG_DROP;
1088         }
1089
1090         m_freem(m);
1091         return (NULL);
1092 }
1093
1094 #define FR_IP6_OFF(fr) \
1095         (ntohs((fr)->fr_ip6f_opt.ip6f_offlg & IP6F_OFF_MASK))
1096 #define FR_IP6_PLEN(fr) (ntohs((fr)->fr_ip6->ip6_plen))
1097 struct mbuf *
1098 pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag,
1099     struct pf_frent *frent, int mff)
1100 {
1101         struct mbuf *m, *m2;
1102         struct pf_frent *frea, *frep, *next;
1103         struct ip6_hdr *ip6;
1104         struct ip6_frag *ip6f;
1105         int plen, off, fr_max;
1106         uint32_t uoff, csum, csum_flags;
1107
1108         VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag));
1109         m = *m0;
1110         frep = NULL;
1111         ip6 = frent->fr_ip6;
1112         ip6f = &frent->fr_ip6f_opt;
1113         off = FR_IP6_OFF(frent);
1114         uoff = frent->fr_ip6f_hlen;
1115         plen = FR_IP6_PLEN(frent);
1116         fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof *ip6);
1117
1118         DPFPRINTF(("0x%llx IPv6 frag plen %u off %u fr_ip6f_hlen %u "
1119             "fr_max %u m_len %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off,
1120             frent->fr_ip6f_hlen, fr_max, m->m_len));
1121
1122         /*
1123          * Leverage partial checksum offload for simple UDP/IP fragments,
1124          * as that is the most common case.
1125          *
1126          * Perform 1's complement adjustment of octets that got included/
1127          * excluded in the hardware-calculated checksum value.  Also take
1128          * care of any trailing bytes and subtract out their partial sum.
1129          */
1130         if (ip6f->ip6f_nxt == IPPROTO_UDP &&
1131             uoff == (sizeof (*ip6) + sizeof (*ip6f)) &&
1132             (m->m_pkthdr.csum_flags &
1133             (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
1134             (CSUM_DATA_VALID | CSUM_PARTIAL)) {
1135                 uint32_t start = m->m_pkthdr.csum_rx_start;
1136                 uint32_t ip_len = (sizeof (*ip6) + ntohs(ip6->ip6_plen));
1137                 int32_t trailer = (m_pktlen(m) - ip_len);
1138                 uint32_t swbytes = (uint32_t)trailer;
1139
1140                 csum = m->m_pkthdr.csum_rx_val;
1141
1142                 ASSERT(trailer >= 0);
1143                 if (start != uoff || trailer != 0) {
1144                         uint16_t s = 0, d = 0;
1145
1146                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
1147                                 s = ip6->ip6_src.s6_addr16[1];
1148                                 ip6->ip6_src.s6_addr16[1] = 0 ;
1149                         }
1150                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
1151                                 d = ip6->ip6_dst.s6_addr16[1];
1152                                 ip6->ip6_dst.s6_addr16[1] = 0;
1153                         }
1154
1155                         /* callee folds in sum */
1156                         csum = m_adj_sum16(m, start, uoff,
1157                             (ip_len - uoff), csum);
1158                         if (uoff > start)
1159                                 swbytes += (uoff - start);
1160                         else
1161                                 swbytes += (start - uoff);
1162
1163                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
1164                                 ip6->ip6_src.s6_addr16[1] = s;
1165                         if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
1166                                 ip6->ip6_dst.s6_addr16[1] = d;
1167
1168                 }
1169                 csum_flags = m->m_pkthdr.csum_flags;
1170
1171                 if (swbytes != 0)
1172                         udp_in6_cksum_stats(swbytes);
1173                 if (trailer != 0)
1174                         m_adj(m, -trailer);
1175         } else {
1176                 csum = 0;
1177                 csum_flags = 0;
1178         }
1179
1180         /* Invalidate checksum */
1181         m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
1182
1183         /* strip off headers up to the fragment payload */
1184         m->m_data += frent->fr_ip6f_hlen;
1185         m->m_len -= frent->fr_ip6f_hlen;
1186
1187         /* Create a new reassembly queue for this packet */
1188         if (*frag == NULL) {
1189                 *frag = pool_get(&pf_frag_pl, PR_NOWAIT);
1190                 if (*frag == NULL) {
1191                         pf_flush_fragments();
1192                         *frag = pool_get(&pf_frag_pl, PR_NOWAIT);
1193                         if (*frag == NULL)
1194                                 goto drop_fragment;
1195                 }
1196
1197                 (*frag)->fr_flags = 0;
1198                 (*frag)->fr_max = 0;
1199                 (*frag)->fr_af = AF_INET6;
1200                 (*frag)->fr_srcx.v6addr = frent->fr_ip6->ip6_src;
1201                 (*frag)->fr_dstx.v6addr = frent->fr_ip6->ip6_dst;
1202                 (*frag)->fr_p = frent->fr_ip6f_opt.ip6f_nxt;
1203                 (*frag)->fr_id6 = frent->fr_ip6f_opt.ip6f_ident;
1204                 (*frag)->fr_timeout = pf_time_second();
1205                 if (csum_flags != 0) {
1206                         (*frag)->fr_csum_flags = csum_flags;
1207                         (*frag)->fr_csum = csum;
1208                 }
1209                 LIST_INIT(&(*frag)->fr_queue);
1210
1211                 RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
1212                 TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);
1213
1214                 /* We do not have a previous fragment */
1215                 frep = NULL;
1216                 goto insert;
1217         }
1218
1219         /*
1220          * If this fragment contains similar checksum offload info
1221          * as that of the existing ones, accumulate checksum.  Otherwise,
1222          * invalidate checksum offload info for the entire datagram.
1223          */
1224         if (csum_flags != 0 && csum_flags == (*frag)->fr_csum_flags)
1225                 (*frag)->fr_csum += csum;
1226         else if ((*frag)->fr_csum_flags != 0)
1227                 (*frag)->fr_csum_flags = 0;
1228
1229         /*
1230          * Find a fragment after the current one:
1231          *  - off contains the real shifted offset.
1232          */
1233         LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
1234                 if (FR_IP6_OFF(frea) > off)
1235                         break;
1236                 frep = frea;
1237         }
1238
1239         VERIFY(frep != NULL || frea != NULL);
1240
1241         if (frep != NULL &&
1242             FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) - frep->fr_ip6f_hlen > off)
1243         {
1244                 u_int16_t precut;
1245
1246                 precut = FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) -
1247                     frep->fr_ip6f_hlen - off;
1248                 if (precut >= plen)
1249                         goto drop_fragment;
1250                 m_adj(frent->fr_m, precut);
1251                 DPFPRINTF(("overlap -%d\n", precut));
1252                 /* Enforce 8 byte boundaries */
1253                 frent->fr_ip6f_opt.ip6f_offlg =
1254                     htons(ntohs(frent->fr_ip6f_opt.ip6f_offlg) +
1255                     (precut >> 3));
1256                 off = FR_IP6_OFF(frent);
1257                 plen -= precut;
1258                 ip6->ip6_plen = htons(plen);
1259         }
1260
1261         for (; frea != NULL && plen + off > FR_IP6_OFF(frea); frea = next) {
1262                 u_int16_t       aftercut;
1263
1264                 aftercut = plen + off - FR_IP6_OFF(frea);
1265                 DPFPRINTF(("adjust overlap %d\n", aftercut));
1266                 if (aftercut < FR_IP6_PLEN(frea) - frea->fr_ip6f_hlen) {
1267                         frea->fr_ip6->ip6_plen = htons(FR_IP6_PLEN(frea) -
1268                                 aftercut);
1269                         frea->fr_ip6f_opt.ip6f_offlg =
1270                             htons(ntohs(frea->fr_ip6f_opt.ip6f_offlg) +
1271                             (aftercut >> 3));
1272                         m_adj(frea->fr_m, aftercut);
1273                         break;
1274                 }
1275
1276                 /* This fragment is completely overlapped, lose it */
1277                 next = LIST_NEXT(frea, fr_next);
1278                 m_freem(frea->fr_m);
1279                 LIST_REMOVE(frea, fr_next);
1280                 pool_put(&pf_frent_pl, frea);
1281                 pf_nfrents--;
1282         }
1283
1284   insert:
1285         /* Update maximum data size */
1286         if ((*frag)->fr_max < fr_max)
1287                 (*frag)->fr_max = fr_max;
1288         /* This is the last segment */
1289         if (!mff)
1290                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
1291
1292         if (frep == NULL)
1293                 LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
1294         else
1295                 LIST_INSERT_AFTER(frep, frent, fr_next);
1296
1297         /* Check if we are completely reassembled */
1298         if (!((*frag)->fr_flags & PFFRAG_SEENLAST))
1299                 return (NULL);
1300
1301         /* Check if we have all the data */
1302         off = 0;
1303         for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
1304                 next = LIST_NEXT(frep, fr_next);
1305                 off += FR_IP6_PLEN(frep) - (frent->fr_ip6f_hlen - sizeof *ip6);
1306                 DPFPRINTF(("frep at %d, next %d, max %d\n",
1307                         off, next == NULL ? -1 : FR_IP6_OFF(next),
1308                         (*frag)->fr_max));
1309                 if (off < (*frag)->fr_max &&
1310                     (next == NULL || FR_IP6_OFF(next) != off)) {
1311                         DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
1312                             off, next == NULL ? -1 : FR_IP6_OFF(next),
1313                             (*frag)->fr_max));
1314                         return (NULL);
1315                 }
1316         }
1317         DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
1318         if (off < (*frag)->fr_max)
1319                 return (NULL);
1320
1321         /* We have all the data */
1322         frent = LIST_FIRST(&(*frag)->fr_queue);
1323         VERIFY(frent != NULL);
1324         if (frent->fr_ip6f_hlen + off > IP_MAXPACKET) {
1325                 DPFPRINTF(("drop: too big: %d\n", off));
1326                 pf_free_fragment(*frag);
1327                 *frag = NULL;
1328                 return (NULL);
1329         }
1330
1331         ip6 = frent->fr_ip6;
1332         ip6->ip6_nxt = (*frag)->fr_p;
1333         ip6->ip6_plen = htons(off);
1334         ip6->ip6_src = (*frag)->fr_srcx.v6addr;
1335         ip6->ip6_dst = (*frag)->fr_dstx.v6addr;
1336
1337         if ((*frag)->fr_csum_flags != 0) {
1338                 csum = (*frag)->fr_csum;
1339
1340                 ADDCARRY(csum);
1341
1342                 m->m_pkthdr.csum_rx_val = csum;
1343                 m->m_pkthdr.csum_rx_start = sizeof (struct ip6_hdr);
1344                 m->m_pkthdr.csum_flags = (*frag)->fr_csum_flags;
1345         } else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
1346             (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1347                 /* loopback checksums are always OK */
1348                 m->m_pkthdr.csum_data = 0xffff;
1349                 m->m_pkthdr.csum_flags &= ~CSUM_PARTIAL;
1350                 m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1351         }
1352
1353         /* Remove from fragment queue */
1354         pf_remove_fragment(*frag);
1355         *frag = NULL;
1356
1357         m = frent->fr_m;
1358         m->m_len += sizeof(struct ip6_hdr);
1359         m->m_data -= sizeof(struct ip6_hdr);
1360         memmove(m->m_data, ip6, sizeof(struct ip6_hdr));
1361
1362         next = LIST_NEXT(frent, fr_next);
1363         pool_put(&pf_frent_pl, frent);
1364         pf_nfrents--;
1365         for (frent = next; next != NULL; frent = next) {
1366                 m2 = frent->fr_m;
1367
1368                 m_cat(m, m2);
1369                 next = LIST_NEXT(frent, fr_next);
1370                 pool_put(&pf_frent_pl, frent);
1371                 pf_nfrents--;
1372         }
1373
1374         /* XXX this should be done elsewhere */
1375         if (m->m_flags & M_PKTHDR) {
1376                 int pktlen = 0;
1377                 for (m2 = m; m2; m2 = m2->m_next)
1378                         pktlen += m2->m_len;
1379                 m->m_pkthdr.len = pktlen;
1380         }
1381
1382         DPFPRINTF(("complete: 0x%llx ip6_plen %d m_pkthdr.len %d\n",
1383             (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip6->ip6_plen),
1384             m->m_pkthdr.len));
1385
1386         return m;
1387
1388  drop_fragment:
1389         /* Oops - fail safe - drop packet */
1390         pool_put(&pf_frent_pl, frent);
1391         --pf_nfrents;
1392         m_freem(m);
1393         return NULL;
1394 }
1395
1396 static struct mbuf *
1397 pf_frag6cache(struct mbuf **m0, struct ip6_hdr *h, struct ip6_frag *fh,
1398     struct pf_fragment **frag, int hlen, int mff, int drop, int *nomem)
1399 {
1400         struct mbuf *m = *m0;
1401         u_int16_t plen, off, fr_max;
1402         struct pf_frcache *frp, *fra, *cur = NULL;
1403         int hosed = 0;
1404
1405         VERIFY(*frag == NULL || !BUFFER_FRAGMENTS(*frag));
1406         m = *m0;
1407         off = ntohs(fh->ip6f_offlg & IP6F_OFF_MASK);
1408         plen = ntohs(h->ip6_plen) - (hlen - sizeof *h);
1409
1410         /*
1411          * Apple Modification: dimambro@apple.com. The hlen, being passed
1412          * into this function Includes all the headers associated with
1413          * the packet, and may include routing headers, so to get to
1414          * the data payload as stored in the original IPv6 header we need
1415          * to subtract al those headers and the IP header.
1416          *
1417          * The 'max' local variable should also contain the offset from the start
1418          * of the reassembled packet to the octet just past the end of the octets
1419          * in the current fragment where:
1420          * - 'off' is the offset from the start of the reassembled packet to the
1421          *    first octet in the fragment,
1422          * - 'plen' is the length of the "payload data length" Excluding all the
1423          *   IPv6 headers of the fragment.
1424          * - 'hlen' is computed in pf_normalize_ip6() as the offset from the start
1425          *   of the IPv6 packet to the beginning of the data.
1426          */
1427         fr_max = off + plen;
1428
1429         DPFPRINTF(("0x%llx plen %u off %u fr_max %u\n",
1430             (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off, fr_max));
1431
1432         /* Create a new range queue for this packet */
1433         if (*frag == NULL) {
1434                 *frag = pool_get(&pf_cache_pl, PR_NOWAIT);
1435                 if (*frag == NULL) {
1436                         pf_flush_fragments();
1437                         *frag = pool_get(&pf_cache_pl, PR_NOWAIT);
1438                         if (*frag == NULL)
1439                                 goto no_mem;
1440                 }
1441
1442                 /* Get an entry for the queue */
1443                 cur = pool_get(&pf_cent_pl, PR_NOWAIT);
1444                 if (cur == NULL) {
1445                         pool_put(&pf_cache_pl, *frag);
1446                         *frag = NULL;
1447                         goto no_mem;
1448                 }
1449                 pf_ncache++;
1450
1451                 (*frag)->fr_flags = PFFRAG_NOBUFFER;
1452                 (*frag)->fr_max = 0;
1453                 (*frag)->fr_af = AF_INET6;
1454                 (*frag)->fr_srcx.v6addr = h->ip6_src;
1455                 (*frag)->fr_dstx.v6addr = h->ip6_dst;
1456                 (*frag)->fr_p = fh->ip6f_nxt;
1457                 (*frag)->fr_id6 = fh->ip6f_ident;
1458                 (*frag)->fr_timeout = pf_time_second();
1459
1460                 cur->fr_off = off;
1461                 cur->fr_end = fr_max;
1462                 LIST_INIT(&(*frag)->fr_cache);
1463                 LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);
1464
1465                 RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
1466                 TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);
1467
1468                 DPFPRINTF(("frag6cache[%d]: new %d-%d\n", ntohl(fh->ip6f_ident),
1469                     off, fr_max));
1470
1471                 goto pass;
1472         }
1473
1474         /*
1475          * Find a fragment after the current one:
1476          *  - off contains the real shifted offset.
1477          */
1478         frp = NULL;
1479         LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
1480                 if (fra->fr_off > off)
1481                         break;
1482                 frp = fra;
1483         }
1484
1485         VERIFY(frp != NULL || fra != NULL);
1486
1487         if (frp != NULL) {
1488                 int precut;
1489
1490                 precut = frp->fr_end - off;
1491                 if (precut >= plen) {
1492                         /* Fragment is entirely a duplicate */
1493                         DPFPRINTF(("frag6cache[%u]: dead (%d-%d) %d-%d\n",
1494                             ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end,
1495                             off, fr_max));
1496                         goto drop_fragment;
1497                 }
1498                 if (precut == 0) {
1499                         /* They are adjacent.  Fixup cache entry */
1500                         DPFPRINTF(("frag6cache[%u]: adjacent (%d-%d) %d-%d\n",
1501                             ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end,
1502                             off, fr_max));
1503                         frp->fr_end = fr_max;
1504                 } else if (precut > 0) {
1505                         /* The first part of this payload overlaps with a
1506                          * fragment that has already been passed.
1507                          * Need to trim off the first part of the payload.
1508                          * But to do so easily, we need to create another
1509                          * mbuf to throw the original header into.
1510                          */
1511
1512                         DPFPRINTF(("frag6cache[%u]: chop %d (%d-%d) %d-%d\n",
1513                             ntohl(fh->ip6f_ident), precut, frp->fr_off,
1514                             frp->fr_end, off, fr_max));
1515
1516                         off += precut;
1517                         fr_max -= precut;
1518                         /* Update the previous frag to encompass this one */
1519                         frp->fr_end = fr_max;
1520
1521                         if (!drop) {
1522                                 /* XXX Optimization opportunity
1523                                  * This is a very heavy way to trim the payload.
1524                                  * we could do it much faster by diddling mbuf
1525                                  * internals but that would be even less legible
1526                                  * than this mbuf magic.  For my next trick,
1527                                  * I'll pull a rabbit out of my laptop.
1528                                  */
1529                                 *m0 = m_copym(m, 0, hlen, M_NOWAIT);
1530                                 if (*m0 == NULL)
1531                                         goto no_mem;
1532                                 VERIFY((*m0)->m_next == NULL);
1533                                 m_adj(m, precut + hlen);
1534                                 m_cat(*m0, m);
1535                                 m = *m0;
1536                                 if (m->m_flags & M_PKTHDR) {
1537                                         int pktlen = 0;
1538                                         struct mbuf *t;
1539                                         for (t = m; t; t = t->m_next)
1540                                                 pktlen += t->m_len;
1541                                         m->m_pkthdr.len = pktlen;
1542                                 }
1543
1544                                 h = mtod(m, struct ip6_hdr *);
1545
1546                                 VERIFY((int)m->m_len ==
1547                                     ntohs(h->ip6_plen) - precut);
1548                                 fh->ip6f_offlg &= ~IP6F_OFF_MASK;
1549                                 fh->ip6f_offlg |=
1550                                     htons(ntohs(fh->ip6f_offlg & IP6F_OFF_MASK)
1551                                     + (precut >> 3));
1552                                 h->ip6_plen = htons(ntohs(h->ip6_plen) -
1553                                     precut);
1554                         } else {
1555                                 hosed++;
1556                         }
1557                 } else {
1558                         /* There is a gap between fragments */
1559
1560                         DPFPRINTF(("frag6cache[%u]: gap %d (%d-%d) %d-%d\n",
1561                             ntohl(fh->ip6f_ident), -precut, frp->fr_off,
1562                             frp->fr_end, off, fr_max));
1563
1564                         cur = pool_get(&pf_cent_pl, PR_NOWAIT);
1565                         if (cur == NULL)
1566                                 goto no_mem;
1567                         pf_ncache++;
1568
1569                         cur->fr_off = off;
1570                         cur->fr_end = fr_max;
1571                         LIST_INSERT_AFTER(frp, cur, fr_next);
1572                 }
1573         }
1574
1575         if (fra != NULL) {
1576                 int     aftercut;
1577                 int     merge = 0;
1578
1579                 aftercut = fr_max - fra->fr_off;
1580                 if (aftercut == 0) {
1581                         /* Adjacent fragments */
1582                         DPFPRINTF(("frag6cache[%u]: adjacent %d-%d (%d-%d)\n",
1583                             ntohl(fh->ip6f_ident), off, fr_max, fra->fr_off,
1584                             fra->fr_end));
1585                         fra->fr_off = off;
1586                         merge = 1;
1587                 } else if (aftercut > 0) {
1588                         /* Need to chop off the tail of this fragment */
1589                         DPFPRINTF(("frag6cache[%u]: chop %d %d-%d (%d-%d)\n",
1590                             ntohl(fh->ip6f_ident), aftercut, off, fr_max,
1591                             fra->fr_off, fra->fr_end));
1592                         fra->fr_off = off;
1593                         fr_max -= aftercut;
1594
1595                         merge = 1;
1596
1597                         if (!drop) {
1598                                 m_adj(m, -aftercut);
1599                                 if (m->m_flags & M_PKTHDR) {
1600                                         int pktlen = 0;
1601                                         struct mbuf *t;
1602                                         for (t = m; t; t = t->m_next)
1603                                                 pktlen += t->m_len;
1604                                         m->m_pkthdr.len = pktlen;
1605                                 }
1606                                 h = mtod(m, struct ip6_hdr *);
1607                                 VERIFY((int)m->m_len ==
1608                                     ntohs(h->ip6_plen) - aftercut);
1609                                 h->ip6_plen =
1610                                     htons(ntohs(h->ip6_plen) - aftercut);
1611                         } else {
1612                                 hosed++;
1613                         }
1614                 } else if (frp == NULL) {
1615                         /* There is a gap between fragments */
1616                         DPFPRINTF(("frag6cache[%u]: gap %d %d-%d (%d-%d)\n",
1617                             ntohl(fh->ip6f_ident), -aftercut, off, fr_max,
1618                             fra->fr_off, fra->fr_end));
1619
1620                         cur = pool_get(&pf_cent_pl, PR_NOWAIT);
1621                         if (cur == NULL)
1622                                 goto no_mem;
1623                         pf_ncache++;
1624
1625                         cur->fr_off = off;
1626                         cur->fr_end = fr_max;
1627                         LIST_INSERT_BEFORE(fra, cur, fr_next);
1628                 }
1629
1630                 /* Need to glue together two separate fragment descriptors */
1631                 if (merge) {
1632                         if (cur && fra->fr_off <= cur->fr_end) {
1633                                 /* Need to merge in a previous 'cur' */
1634                                 DPFPRINTF(("frag6cache[%u]: adjacent(merge "
1635                                     "%d-%d) %d-%d (%d-%d)\n",
1636                                     ntohl(fh->ip6f_ident), cur->fr_off,
1637                                     cur->fr_end, off, fr_max, fra->fr_off,
1638                                     fra->fr_end));
1639                                 fra->fr_off = cur->fr_off;
1640                                 LIST_REMOVE(cur, fr_next);
1641                                 pool_put(&pf_cent_pl, cur);
1642                                 pf_ncache--;
1643                                 cur = NULL;
1644                         } else if (frp && fra->fr_off <= frp->fr_end) {
1645                                 /* Need to merge in a modified 'frp' */
1646                                 VERIFY(cur == NULL);
1647                                 DPFPRINTF(("frag6cache[%u]: adjacent(merge "
1648                                     "%d-%d) %d-%d (%d-%d)\n",
1649                                     ntohl(fh->ip6f_ident), frp->fr_off,
1650                                     frp->fr_end, off, fr_max, fra->fr_off,
1651                                     fra->fr_end));
1652                                 fra->fr_off = frp->fr_off;
1653                                 LIST_REMOVE(frp, fr_next);
1654                                 pool_put(&pf_cent_pl, frp);
1655                                 pf_ncache--;
1656                                 frp = NULL;
1657                         }
1658                 }
1659         }
1660
1661         if (hosed) {
1662                 /*
1663                  * We must keep tracking the overall fragment even when
1664                  * we're going to drop it anyway so that we know when to
1665                  * free the overall descriptor.  Thus we drop the frag late.
1666                  */
1667                 goto drop_fragment;
1668         }
1669
1670  pass:
1671         /* Update maximum data size */
1672         if ((*frag)->fr_max < fr_max)
1673                 (*frag)->fr_max = fr_max;
1674
1675         /* This is the last segment */
1676         if (!mff)
1677                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
1678
1679         /* Check if we are completely reassembled */
1680         if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
1681             LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
1682             LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
1683                 /* Remove from fragment queue */
1684                 DPFPRINTF(("frag6cache[%u]: done 0-%d\n",
1685                     ntohl(fh->ip6f_ident), (*frag)->fr_max));
1686                 pf_free_fragment(*frag);
1687                 *frag = NULL;
1688         }
1689
1690         return (m);
1691
1692  no_mem:
1693         *nomem = 1;
1694
1695         /* Still need to pay attention to !IP_MF */
1696         if (!mff && *frag != NULL)
1697                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
1698
1699         m_freem(m);
1700         return (NULL);
1701
1702  drop_fragment:
1703
1704         /* Still need to pay attention to !IP_MF */
1705         if (!mff && *frag != NULL)
1706                 (*frag)->fr_flags |= PFFRAG_SEENLAST;
1707
1708         if (drop) {
1709                 /* This fragment has been deemed bad.  Don't reass */
1710                 if (((*frag)->fr_flags & PFFRAG_DROP) == 0)
1711                         DPFPRINTF(("frag6cache[%u]: dropping overall fragment\n",
1712                             ntohl(fh->ip6f_ident)));
1713                 (*frag)->fr_flags |= PFFRAG_DROP;
1714         }
1715
1716         m_freem(m);
1717         return (NULL);
1718 }
1719
1720 int
1721 pf_normalize_ip(pbuf_t *pbuf, int dir, struct pfi_kif *kif, u_short *reason,
1722     struct pf_pdesc *pd)
1723 {
1724         struct mbuf             *m;
1725         struct pf_rule          *r;
1726         struct pf_frent         *frent;
1727         struct pf_fragment      *frag = NULL;
1728         struct ip               *h = pbuf->pb_data;
1729         int                      mff = (ntohs(h->ip_off) & IP_MF);
1730         int                      hlen = h->ip_hl << 2;
1731         u_int16_t                fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
1732         u_int16_t                fr_max;
1733         int                      ip_len;
1734         int                      ip_off;
1735         int                      asd = 0;
1736         struct pf_ruleset       *ruleset = NULL;
1737         struct ifnet            *ifp = pbuf->pb_ifp;
1738
1739         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
1740         while (r != NULL) {
1741                 r->evaluations++;
1742                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
1743                         r = r->skip[PF_SKIP_IFP].ptr;
1744                 else if (r->direction && r->direction != dir)
1745                         r = r->skip[PF_SKIP_DIR].ptr;
1746                 else if (r->af && r->af != AF_INET)
1747                         r = r->skip[PF_SKIP_AF].ptr;
1748                 else if (r->proto && r->proto != h->ip_p)
1749                         r = r->skip[PF_SKIP_PROTO].ptr;
1750                 else if (PF_MISMATCHAW(&r->src.addr,
1751                     (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
1752                     r->src.neg, kif))
1753                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
1754                 else if (PF_MISMATCHAW(&r->dst.addr,
1755                     (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
1756                     r->dst.neg, NULL))
1757                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
1758                 else {
1759                         if (r->anchor == NULL)
1760                                 break;
1761                         else
1762                                 pf_step_into_anchor(&asd, &ruleset,
1763                                     PF_RULESET_SCRUB, &r, NULL, NULL);
1764                 }
1765                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
1766                     PF_RULESET_SCRUB, &r, NULL, NULL))
1767                         break;
1768         }
1769
1770         if (r == NULL || r->action == PF_NOSCRUB)
1771                 return (PF_PASS);
1772         else {
1773                 r->packets[dir == PF_OUT]++;
1774                 r->bytes[dir == PF_OUT] += pd->tot_len;
1775         }
1776
1777         /* Check for illegal packets */
1778         if (hlen < (int)sizeof (struct ip))
1779                 goto drop;
1780
1781         if (hlen > ntohs(h->ip_len))
1782                 goto drop;
1783
1784         /* Clear IP_DF if the rule uses the no-df option */
1785         if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
1786                 u_int16_t ipoff = h->ip_off;
1787
1788                 h->ip_off &= htons(~IP_DF);
1789                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ipoff, h->ip_off, 0);
1790         }
1791
1792         /* We will need other tests here */
1793         if (!fragoff && !mff)
1794                 goto no_fragment;
1795
1796         /*
1797          * We're dealing with a fragment now. Don't allow fragments
1798          * with IP_DF to enter the cache. If the flag was cleared by
1799          * no-df above, fine. Otherwise drop it.
1800          */
1801         if (h->ip_off & htons(IP_DF)) {
1802                 DPFPRINTF(("IP_DF\n"));
1803                 goto bad;
1804         }
1805
1806         ip_len = ntohs(h->ip_len) - hlen;
1807         ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
1808
1809         /* All fragments are 8 byte aligned */
1810         if (mff && (ip_len & 0x7)) {
1811                 DPFPRINTF(("mff and %d\n", ip_len));
1812                 goto bad;
1813         }
1814
1815         /* Respect maximum length */
1816         if (fragoff + ip_len > IP_MAXPACKET) {
1817                 DPFPRINTF(("max packet %d\n", fragoff + ip_len));
1818                 goto bad;
1819         }
1820         fr_max = fragoff + ip_len;
1821
1822         if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
1823                 /* Fully buffer all of the fragments */
1824
1825                 frag = pf_find_fragment_by_ipv4_header(h, &pf_frag_tree);
1826                 /* Check if we saw the last fragment already */
1827                 if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1828                     fr_max > frag->fr_max)
1829                         goto bad;
1830
1831                 if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
1832                         REASON_SET(reason, PFRES_MEMORY);
1833                         return (PF_DROP);
1834                 }
1835
1836                 VERIFY(!pbuf_is_valid(pbuf));
1837
1838                 /* Restore iph pointer after pbuf_to_mbuf() */
1839                 h = mtod(m, struct ip *);
1840
1841                 /* Get an entry for the fragment queue */
1842                 frent = pool_get(&pf_frent_pl, PR_NOWAIT);
1843                 if (frent == NULL) {
1844                         REASON_SET(reason, PFRES_MEMORY);
1845                         m_freem(m);
1846                         return (PF_DROP);
1847                 }
1848                 pf_nfrents++;
1849                 frent->fr_ip = h;
1850                 frent->fr_m = m;
1851
1852                 /* Might return a completely reassembled mbuf, or NULL */
1853                 DPFPRINTF(("reass IPv4 frag %d @ %d-%d\n", ntohs(h->ip_id),
1854                     fragoff, fr_max));
1855                 m = pf_reassemble(m, &frag, frent, mff);
1856
1857                 if (m == NULL)
1858                         return (PF_DROP);
1859
1860                 VERIFY(m->m_flags & M_PKTHDR);
1861                 pbuf_init_mbuf(pbuf, m, ifp);
1862
1863                 /* use mtag from concatenated mbuf chain */
1864                 pd->pf_mtag = pf_find_mtag_pbuf(pbuf);
1865 #if 0
1866 // SCW: This check is superfluous
1867 #if DIAGNOSTIC
1868                 if (pd->pf_mtag == NULL) {
1869                         printf("%s: pf_find_mtag returned NULL(1)\n", __func__);
1870                         if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
1871                                 m_freem(m);
1872                                 m = NULL;
1873                                 goto no_mem;
1874                         }
1875                 }
1876 #endif
1877 #endif
1878
1879                 h = mtod(m, struct ip *);
1880
1881                 if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1882                         goto drop;
1883         } else {
1884                 /* non-buffering fragment cache (drops or masks overlaps) */
1885                 int     nomem = 0;
1886
1887                 if (dir == PF_OUT && (pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) {
1888                         /*
1889                          * Already passed the fragment cache in the
1890                          * input direction.  If we continued, it would
1891                          * appear to be a dup and would be dropped.
1892                          */
1893                         goto fragment_pass;
1894                 }
1895
1896                 frag = pf_find_fragment_by_ipv4_header(h, &pf_cache_tree);
1897
1898                 /* Check if we saw the last fragment already */
1899                 if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
1900                     fr_max > frag->fr_max) {
1901                         if (r->rule_flag & PFRULE_FRAGDROP)
1902                                 frag->fr_flags |= PFFRAG_DROP;
1903                         goto bad;
1904                 }
1905
1906                 if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
1907                         REASON_SET(reason, PFRES_MEMORY);
1908                         goto bad;
1909                 }
1910
1911                 VERIFY(!pbuf_is_valid(pbuf));
1912
1913                 /* Restore iph pointer after pbuf_to_mbuf() */
1914                 h = mtod(m, struct ip *);
1915
1916                 m = pf_fragcache(&m, h, &frag, mff,
1917                     (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
1918                 if (m == NULL) {
1919                         // Note: pf_fragcache() has already m_freem'd the mbuf
1920                         if (nomem)
1921                                 goto no_mem;
1922                         goto drop;
1923                 }
1924
1925                 VERIFY(m->m_flags & M_PKTHDR);
1926                 pbuf_init_mbuf(pbuf, m, ifp);
1927
1928                 /* use mtag from copied and trimmed mbuf chain */
1929                 pd->pf_mtag = pf_find_mtag_pbuf(pbuf);
1930 #if 0
1931 // SCW: This check is superfluous
1932 #if DIAGNOSTIC
1933                 if (pd->pf_mtag == NULL) {
1934                         printf("%s: pf_find_mtag returned NULL(2)\n", __func__);
1935                         if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
1936                                 m_freem(m);
1937                                 m = NULL;
1938                                 goto no_mem;
1939                         }
1940                 }
1941 #endif
1942 #endif
1943                 if (dir == PF_IN)
1944                         pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE;
1945
1946                 if (frag != NULL && (frag->fr_flags & PFFRAG_DROP))
1947                         goto drop;
1948
1949                 goto fragment_pass;
1950         }
1951
1952 no_fragment:
1953         /* At this point, only IP_DF is allowed in ip_off */
1954         if (h->ip_off & ~htons(IP_DF)) {
1955                 u_int16_t ipoff = h->ip_off;
1956
1957                 h->ip_off &= htons(IP_DF);
1958                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ipoff, h->ip_off, 0);
1959         }
1960
1961         /* Enforce a minimum ttl, may cause endless packet loops */
1962         if (r->min_ttl && h->ip_ttl < r->min_ttl) {
1963                 u_int16_t ip_ttl = h->ip_ttl;
1964
1965                 h->ip_ttl = r->min_ttl;
1966                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
1967         }
1968         if (r->rule_flag & PFRULE_RANDOMID) {
1969                 u_int16_t oip_id = h->ip_id;
1970
1971                 if (rfc6864 && IP_OFF_IS_ATOMIC(ntohs(h->ip_off))) {
1972                         h->ip_id = 0;
1973                 } else {
1974                         h->ip_id = ip_randomid();
1975                 }
1976                 h->ip_sum = pf_cksum_fixup(h->ip_sum, oip_id, h->ip_id, 0);
1977         }
1978         if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1979                 pd->flags |= PFDESC_IP_REAS;
1980
1981         return (PF_PASS);
1982
1983 fragment_pass:
1984         /* Enforce a minimum ttl, may cause endless packet loops */
1985         if (r->min_ttl && h->ip_ttl < r->min_ttl) {
1986                 u_int16_t ip_ttl = h->ip_ttl;
1987
1988                 h->ip_ttl = r->min_ttl;
1989                 h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
1990         }
1991         if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0)
1992                 pd->flags |= PFDESC_IP_REAS;
1993         return (PF_PASS);
1994
1995 no_mem:
1996         REASON_SET(reason, PFRES_MEMORY);
1997         if (r != NULL && r->log && pbuf_is_valid(pbuf))
1998                 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, *reason, r,
1999                     NULL, NULL, pd);
2000         return (PF_DROP);
2001
2002 drop:
2003         REASON_SET(reason, PFRES_NORM);
2004         if (r != NULL && r->log && pbuf_is_valid(pbuf))
2005                 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, *reason, r,
2006                     NULL, NULL, pd);
2007         return (PF_DROP);
2008
2009 bad:
2010         DPFPRINTF(("dropping bad IPv4 fragment\n"));
2011
2012         /* Free associated fragments */
2013         if (frag != NULL)
2014                 pf_free_fragment(frag);
2015
2016         REASON_SET(reason, PFRES_FRAG);
2017         if (r != NULL && r->log && pbuf_is_valid(pbuf))
2018                 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, *reason, r, NULL, NULL, pd);
2019
2020         return (PF_DROP);
2021 }
2022
2023 #if INET6
2024 int
2025 pf_normalize_ip6(pbuf_t *pbuf, int dir, struct pfi_kif *kif,
2026     u_short *reason, struct pf_pdesc *pd)
2027 {
2028         struct mbuf             *m;
2029         struct pf_rule          *r;
2030         struct ip6_hdr          *h = pbuf->pb_data;
2031         int                      off;
2032         struct ip6_ext           ext;
2033 /* adi XXX */
2034 #if 0
2035         struct ip6_opt           opt;
2036         struct ip6_opt_jumbo     jumbo;
2037         int                      optend;
2038         int                      ooff;
2039 #endif
2040         struct ip6_frag          frag;
2041         u_int32_t                jumbolen = 0, plen;
2042         u_int16_t                fragoff = 0;
2043         u_int8_t                 proto;
2044         int                      terminal;
2045         struct pf_frent         *frent;
2046         struct pf_fragment      *pff = NULL;
2047         int                      mff = 0, rh_cnt = 0;
2048         u_int16_t                fr_max;
2049         int                      asd = 0;
2050         struct pf_ruleset       *ruleset = NULL;
2051         struct ifnet            *ifp = pbuf->pb_ifp;
2052
2053         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
2054         while (r != NULL) {
2055                 r->evaluations++;
2056                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
2057                         r = r->skip[PF_SKIP_IFP].ptr;
2058                 else if (r->direction && r->direction != dir)
2059                         r = r->skip[PF_SKIP_DIR].ptr;
2060                 else if (r->af && r->af != AF_INET6)
2061                         r = r->skip[PF_SKIP_AF].ptr;
2062 #if 0 /* header chain! */
2063                 else if (r->proto && r->proto != h->ip6_nxt)
2064                         r = r->skip[PF_SKIP_PROTO].ptr;
2065 #endif
2066                 else if (PF_MISMATCHAW(&r->src.addr,
2067                     (struct pf_addr *)(uintptr_t)&h->ip6_src, AF_INET6,
2068                     r->src.neg, kif))
2069                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
2070                 else if (PF_MISMATCHAW(&r->dst.addr,
2071                     (struct pf_addr *)(uintptr_t)&h->ip6_dst, AF_INET6,
2072                     r->dst.neg, NULL))
2073                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
2074                 else {
2075                         if (r->anchor == NULL)
2076                                 break;
2077                         else
2078                                 pf_step_into_anchor(&asd, &ruleset,
2079                                     PF_RULESET_SCRUB, &r, NULL, NULL);
2080                 }
2081                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
2082                     PF_RULESET_SCRUB, &r, NULL, NULL))
2083                         break;
2084         }
2085
2086         if (r == NULL || r->action == PF_NOSCRUB)
2087                 return (PF_PASS);
2088         else {
2089                 r->packets[dir == PF_OUT]++;
2090                 r->bytes[dir == PF_OUT] += pd->tot_len;
2091         }
2092
2093         /* Check for illegal packets */
2094         if ((uint32_t)(sizeof (struct ip6_hdr) + IPV6_MAXPACKET) <
2095             pbuf->pb_packet_len)
2096                 goto drop;
2097
2098         off = sizeof (struct ip6_hdr);
2099         proto = h->ip6_nxt;
2100         terminal = 0;
2101         do {
2102                 pd->proto = proto;
2103                 switch (proto) {
2104                 case IPPROTO_FRAGMENT:
2105                         goto fragment;
2106                 case IPPROTO_AH:
2107                 case IPPROTO_ROUTING:
2108                 case IPPROTO_DSTOPTS:
2109                         if (!pf_pull_hdr(pbuf, off, &ext, sizeof (ext), NULL,
2110                             NULL, AF_INET6))
2111                                 goto shortpkt;
2112                         /*
2113                          * <jhw@apple.com>
2114                          * Multiple routing headers not allowed.
2115                          * Routing header type zero considered harmful.
2116                          */
2117                         if (proto == IPPROTO_ROUTING) {
2118                                 const struct ip6_rthdr *rh =
2119                                     (const struct ip6_rthdr *)&ext;
2120                                 if (rh_cnt++)
2121                                         goto drop;
2122                                 if (rh->ip6r_type == IPV6_RTHDR_TYPE_0)
2123                                         goto drop;
2124                         }
2125                         else
2126                         if (proto == IPPROTO_AH)
2127                                 off += (ext.ip6e_len + 2) * 4;
2128                         else
2129                                 off += (ext.ip6e_len + 1) * 8;
2130                         proto = ext.ip6e_nxt;
2131                         break;
2132                 case IPPROTO_HOPOPTS:
2133 /* adi XXX */
2134 #if 0
2135                         if (!pf_pull_hdr(m, off, &ext, sizeof (ext), NULL,
2136                             NULL, AF_INET6))
2137                                 goto shortpkt;
2138                         optend = off + (ext.ip6e_len + 1) * 8;
2139                         ooff = off + sizeof (ext);
2140                         do {
2141                                 if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
2142                                     sizeof (opt.ip6o_type), NULL, NULL,
2143                                     AF_INET6))
2144                                         goto shortpkt;
2145                                 if (opt.ip6o_type == IP6OPT_PAD1) {
2146                                         ooff++;
2147                                         continue;
2148                                 }
2149                                 if (!pf_pull_hdr(m, ooff, &opt, sizeof (opt),
2150                                     NULL, NULL, AF_INET6))
2151                                         goto shortpkt;
2152                                 if (ooff + sizeof (opt) + opt.ip6o_len > optend)
2153                                         goto drop;
2154                                 switch (opt.ip6o_type) {
2155                                 case IP6OPT_JUMBO:
2156                                         if (h->ip6_plen != 0)
2157                                                 goto drop;
2158                                         if (!pf_pull_hdr(m, ooff, &jumbo,
2159                                             sizeof (jumbo), NULL, NULL,
2160                                             AF_INET6))
2161                                                 goto shortpkt;
2162                                         memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
2163                                             sizeof (jumbolen));
2164                                         jumbolen = ntohl(jumbolen);
2165                                         if (jumbolen <= IPV6_MAXPACKET)
2166                                                 goto drop;
2167                                         if (sizeof (struct ip6_hdr) +
2168                                             jumbolen != m->m_pkthdr.len)
2169                                                 goto drop;
2170                                         break;
2171                                 default:
2172                                         break;
2173                                 }
2174                                 ooff += sizeof (opt) + opt.ip6o_len;
2175                         } while (ooff < optend);
2176
2177                         off = optend;
2178                         proto = ext.ip6e_nxt;
2179                         break;
2180 #endif
2181                 default:
2182                         terminal = 1;
2183                         break;
2184                 }
2185         } while (!terminal);
2186
2187         /* jumbo payload option must be present, or plen > 0 */
2188         if (ntohs(h->ip6_plen) == 0)
2189                 plen = jumbolen;
2190         else
2191                 plen = ntohs(h->ip6_plen);
2192         if (plen == 0)
2193                 goto drop;
2194         if ((uint32_t)(sizeof (struct ip6_hdr) + plen) > pbuf->pb_packet_len)
2195                 goto shortpkt;
2196
2197         /* Enforce a minimum ttl, may cause endless packet loops */
2198         if (r->min_ttl && h->ip6_hlim < r->min_ttl)
2199                 h->ip6_hlim = r->min_ttl;
2200
2201         return (PF_PASS);
2202
2203 fragment:
2204         if (ntohs(h->ip6_plen) == 0 || jumbolen)
2205                 goto drop;
2206         plen = ntohs(h->ip6_plen);
2207
2208         if (!pf_pull_hdr(pbuf, off, &frag, sizeof (frag), NULL, NULL, AF_INET6))
2209                 goto shortpkt;
2210         fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
2211         pd->proto = frag.ip6f_nxt;
2212         mff = ntohs(frag.ip6f_offlg & IP6F_MORE_FRAG);
2213         off += sizeof frag;
2214         if (fragoff + (plen - off) > IPV6_MAXPACKET)
2215                goto badfrag;
2216
2217         fr_max = fragoff + plen - (off - sizeof(struct ip6_hdr));
2218 // XXX SCW: mbuf-specific
2219 //      DPFPRINTF(("0x%llx IPv6 frag plen %u mff %d off %u fragoff %u "
2220 //          "fr_max %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, mff, off,
2221 //          fragoff, fr_max));
2222
2223         if ((r->rule_flag & (PFRULE_FRAGCROP|PFRULE_FRAGDROP)) == 0) {
2224                 /* Fully buffer all of the fragments */
2225                 pd->flags |= PFDESC_IP_REAS;
2226
2227                 pff = pf_find_fragment_by_ipv6_header(h, &frag,
2228                    &pf_frag_tree);
2229
2230                 /* Check if we saw the last fragment already */
2231                 if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) &&
2232                     fr_max > pff->fr_max)
2233                         goto badfrag;
2234
2235                 if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
2236                         REASON_SET(reason, PFRES_MEMORY);
2237                         return (PF_DROP);
2238                 }
2239
2240                 /* Restore iph pointer after pbuf_to_mbuf() */
2241                 h = mtod(m, struct ip6_hdr *);
2242
2243                 /* Get an entry for the fragment queue */
2244                 frent = pool_get(&pf_frent_pl, PR_NOWAIT);
2245                 if (frent == NULL) {
2246                         REASON_SET(reason, PFRES_MEMORY);
2247                         return (PF_DROP);
2248                 }
2249
2250                 pf_nfrents++;
2251                 frent->fr_ip6 = h;
2252                 frent->fr_m = m;
2253                 frent->fr_ip6f_opt = frag;
2254                 frent->fr_ip6f_hlen = off;
2255
2256                 /* Might return a completely reassembled mbuf, or NULL */
2257                 DPFPRINTF(("reass IPv6 frag %d @ %d-%d\n",
2258                      ntohl(frag.ip6f_ident), fragoff, fr_max));
2259                 m = pf_reassemble6(&m, &pff, frent, mff);
2260
2261                 if (m == NULL)
2262                         return (PF_DROP);
2263
2264                 pbuf_init_mbuf(pbuf, m, ifp);
2265                 h = pbuf->pb_data;
2266
2267                 if (pff != NULL && (pff->fr_flags & PFFRAG_DROP))
2268                         goto drop;
2269         }
2270         else if (dir == PF_IN || !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) {
2271                 /* non-buffering fragment cache (overlaps: see RFC 5722) */
2272                 int nomem = 0;
2273
2274                 pff = pf_find_fragment_by_ipv6_header(h, &frag,
2275                     &pf_cache_tree);
2276
2277                 /* Check if we saw the last fragment already */
2278                 if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) &&
2279                     fr_max > pff->fr_max) {
2280                        if (r->rule_flag & PFRULE_FRAGDROP)
2281                                 pff->fr_flags |= PFFRAG_DROP;
2282                        goto badfrag;
2283                 }
2284
2285                 if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
2286                         goto no_mem;
2287                 }
2288
2289                 /* Restore iph pointer after pbuf_to_mbuf() */
2290                 h = mtod(m, struct ip6_hdr *);
2291
2292                 m = pf_frag6cache(&m, h, &frag, &pff, off, mff,
2293                      (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
2294                 if (m == NULL) {
2295                         // Note: pf_frag6cache() has already m_freem'd the mbuf
2296                         if (nomem)
2297                                 goto no_mem;
2298                         goto drop;
2299                 }
2300
2301                 pbuf_init_mbuf(pbuf, m, ifp);
2302                 pd->pf_mtag = pf_find_mtag_pbuf(pbuf);
2303                 h = pbuf->pb_data;
2304
2305                 if (dir == PF_IN)
2306                         pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE;
2307
2308                 if (pff != NULL && (pff->fr_flags & PFFRAG_DROP))
2309                         goto drop;
2310         }
2311
2312         /* Enforce a minimum ttl, may cause endless packet loops */
2313         if (r->min_ttl && h->ip6_hlim < r->min_ttl)
2314                 h->ip6_hlim = r->min_ttl;
2315         return (PF_PASS);
2316
2317   no_mem:
2318         REASON_SET(reason, PFRES_MEMORY);
2319         goto dropout;
2320
2321   shortpkt:
2322         REASON_SET(reason, PFRES_SHORT);
2323         goto dropout;
2324
2325   drop:
2326         REASON_SET(reason, PFRES_NORM);
2327         goto dropout;
2328
2329   badfrag:
2330         DPFPRINTF(("dropping bad IPv6 fragment\n"));
2331         REASON_SET(reason, PFRES_FRAG);
2332         goto dropout;
2333
2334   dropout:
2335         if (pff != NULL)
2336                 pf_free_fragment(pff);
2337         if (r != NULL && r->log && pbuf_is_valid(pbuf))
2338                 PFLOG_PACKET(kif, h, pbuf, AF_INET6, dir, *reason, r, NULL, NULL, pd);
2339         return (PF_DROP);
2340 }
2341 #endif /* INET6 */
2342
2343 int
2344 pf_normalize_tcp(int dir, struct pfi_kif *kif, pbuf_t *pbuf, int ipoff,
2345     int off, void *h, struct pf_pdesc *pd)
2346 {
2347 #pragma unused(ipoff, h)
2348         struct pf_rule  *r, *rm = NULL;
2349         struct tcphdr   *th = pd->hdr.tcp;
2350         int              rewrite = 0;
2351         int              asd = 0;
2352         u_short          reason;
2353         u_int8_t         flags;
2354         sa_family_t      af = pd->af;
2355         struct pf_ruleset *ruleset = NULL;
2356         union pf_state_xport sxport, dxport;
2357
2358         sxport.port = th->th_sport;
2359         dxport.port = th->th_dport;
2360
2361         r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
2362         while (r != NULL) {
2363                 r->evaluations++;
2364                 if (pfi_kif_match(r->kif, kif) == r->ifnot)
2365                         r = r->skip[PF_SKIP_IFP].ptr;
2366                 else if (r->direction && r->direction != dir)
2367                         r = r->skip[PF_SKIP_DIR].ptr;
2368                 else if (r->af && r->af != af)
2369                         r = r->skip[PF_SKIP_AF].ptr;
2370                 else if (r->proto && r->proto != pd->proto)
2371                         r = r->skip[PF_SKIP_PROTO].ptr;
2372                 else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
2373                     r->src.neg, kif))
2374                         r = r->skip[PF_SKIP_SRC_ADDR].ptr;
2375                 else if (r->src.xport.range.op &&
2376                     !pf_match_xport(r->src.xport.range.op, r->proto_variant,
2377                     &r->src.xport, &sxport))
2378                         r = r->skip[PF_SKIP_SRC_PORT].ptr;
2379                 else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
2380                     r->dst.neg, NULL))
2381                         r = r->skip[PF_SKIP_DST_ADDR].ptr;
2382                 else if (r->dst.xport.range.op &&
2383                     !pf_match_xport(r->dst.xport.range.op, r->proto_variant,
2384                     &r->dst.xport, &dxport))
2385                         r = r->skip[PF_SKIP_DST_PORT].ptr;
2386                 else if (r->os_fingerprint != PF_OSFP_ANY &&
2387                     !pf_osfp_match(pf_osfp_fingerprint(pd, pbuf, off, th),
2388                     r->os_fingerprint))
2389                         r = TAILQ_NEXT(r, entries);
2390                 else {
2391                         if (r->anchor == NULL) {
2392                                 rm = r;
2393                                 break;
2394                         } else {
2395                                 pf_step_into_anchor(&asd, &ruleset,
2396                                     PF_RULESET_SCRUB, &r, NULL, NULL);
2397                         }
2398                 }
2399                 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
2400                     PF_RULESET_SCRUB, &r, NULL, NULL))
2401                         break;
2402         }
2403
2404         if (rm == NULL || rm->action == PF_NOSCRUB)
2405                 return (PF_PASS);
2406         else {
2407                 r->packets[dir == PF_OUT]++;
2408                 r->bytes[dir == PF_OUT] += pd->tot_len;
2409         }
2410
2411         if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
2412                 pd->flags |= PFDESC_TCP_NORM;
2413
2414         flags = th->th_flags;
2415         if (flags & TH_SYN) {
2416                 /* Illegal packet */
2417                 if (flags & TH_RST)
2418                         goto tcp_drop;
2419
2420                 if (flags & TH_FIN)
2421                         flags &= ~TH_FIN;
2422         } else {
2423                 /* Illegal packet */
2424                 if (!(flags & (TH_ACK|TH_RST)))
2425                         goto tcp_drop;
2426         }
2427
2428         if (!(flags & TH_ACK)) {
2429                 /* These flags are only valid if ACK is set */
2430                 if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
2431                         goto tcp_drop;
2432         }
2433
2434         /* Check for illegal header length */
2435         if (th->th_off < (sizeof (struct tcphdr) >> 2))
2436                 goto tcp_drop;
2437
2438         /* If flags changed, or reserved data set, then adjust */
2439         if (flags != th->th_flags || th->th_x2 != 0) {
2440                 u_int16_t       ov, nv;
2441
2442                 ov = *(u_int16_t *)(&th->th_ack + 1);
2443                 th->th_flags = flags;
2444                 th->th_x2 = 0;
2445                 nv = *(u_int16_t *)(&th->th_ack + 1);
2446
2447                 th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
2448                 rewrite = 1;
2449         }
2450
2451         /* Remove urgent pointer, if TH_URG is not set */
2452         if (!(flags & TH_URG) && th->th_urp) {
2453                 th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
2454                 th->th_urp = 0;
2455                 rewrite = 1;
2456         }
2457
2458         /* copy back packet headers if we sanitized */
2459         /* Process options */
2460         if (r->max_mss) {
2461                 int rv = pf_normalize_tcpopt(r, dir, kif, pd, pbuf, th, off,
2462                     &rewrite);
2463                 if (rv == PF_DROP)
2464                         return rv;
2465                 pbuf = pd->mp;
2466         }
2467
2468         if (rewrite) {
2469                 if (pf_lazy_makewritable(pd, pbuf,
2470                     off + sizeof (*th)) == NULL) {
2471                         REASON_SET(&reason, PFRES_MEMORY);
2472                         if (r->log)
2473                                 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, reason,
2474                                     r, 0, 0, pd);
2475                         return PF_DROP;
2476                 }
2477
2478                 pbuf_copy_back(pbuf, off, sizeof (*th), th);
2479         }
2480
2481         return (PF_PASS);
2482
2483 tcp_drop:
2484         REASON_SET(&reason, PFRES_NORM);
2485         if (rm != NULL && r->log)
2486                 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, reason, r, NULL, NULL, pd);
2487         return (PF_DROP);
2488 }
2489
2490 int
2491 pf_normalize_tcp_init(pbuf_t *pbuf, int off, struct pf_pdesc *pd,
2492     struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
2493 {
2494 #pragma unused(dst)
2495         u_int32_t tsval, tsecr;
2496         u_int8_t hdr[60];
2497         u_int8_t *opt;
2498
2499         VERIFY(src->scrub == NULL);
2500
2501         src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
2502         if (src->scrub == NULL)
2503                 return (1);
2504         bzero(src->scrub, sizeof (*src->scrub));
2505
2506         switch (pd->af) {
2507 #if INET
2508         case AF_INET: {
2509                 struct ip *h = pbuf->pb_data;
2510                 src->scrub->pfss_ttl = h->ip_ttl;
2511                 break;
2512         }
2513 #endif /* INET */
2514 #if INET6
2515         case AF_INET6: {
2516                 struct ip6_hdr *h = pbuf->pb_data;
2517                 src->scrub->pfss_ttl = h->ip6_hlim;
2518                 break;
2519         }
2520 #endif /* INET6 */
2521         }
2522
2523
2524         /*
2525          * All normalizations below are only begun if we see the start of
2526          * the connections.  They must all set an enabled bit in pfss_flags
2527          */
2528         if ((th->th_flags & TH_SYN) == 0)
2529                 return (0);
2530
2531
2532         if (th->th_off > (sizeof (struct tcphdr) >> 2) && src->scrub &&
2533             pf_pull_hdr(pbuf, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
2534                 /* Diddle with TCP options */
2535                 int hlen;
2536                 opt = hdr + sizeof (struct tcphdr);
2537                 hlen = (th->th_off << 2) - sizeof (struct tcphdr);
2538                 while (hlen >= TCPOLEN_TIMESTAMP) {
2539                         switch (*opt) {
2540                         case TCPOPT_EOL:        /* FALLTHROUGH */
2541                         case TCPOPT_NOP:
2542                                 opt++;
2543                                 hlen--;
2544                                 break;
2545                         case TCPOPT_TIMESTAMP:
2546                                 if (opt[1] >= TCPOLEN_TIMESTAMP) {
2547                                         src->scrub->pfss_flags |=
2548                                             PFSS_TIMESTAMP;
2549                                         src->scrub->pfss_ts_mod =
2550                                             htonl(random());
2551
2552                                         /* note PFSS_PAWS not set yet */
2553                                         memcpy(&tsval, &opt[2],
2554                                             sizeof (u_int32_t));
2555                                         memcpy(&tsecr, &opt[6],
2556                                             sizeof (u_int32_t));
2557                                         src->scrub->pfss_tsval0 = ntohl(tsval);
2558                                         src->scrub->pfss_tsval = ntohl(tsval);
2559                                         src->scrub->pfss_tsecr = ntohl(tsecr);
2560                                         getmicrouptime(&src->scrub->pfss_last);
2561                                 }
2562                                 /* FALLTHROUGH */
2563                         default:
2564                                 hlen -= MAX(opt[1], 2);
2565                                 opt += MAX(opt[1], 2);
2566                                 break;
2567                         }
2568                 }
2569         }
2570
2571         return (0);
2572 }
2573
2574 void
2575 pf_normalize_tcp_cleanup(struct pf_state *state)
2576 {
2577         if (state->src.scrub)
2578                 pool_put(&pf_state_scrub_pl, state->src.scrub);
2579         if (state->dst.scrub)
2580                 pool_put(&pf_state_scrub_pl, state->dst.scrub);
2581
2582         /* Someday... flush the TCP segment reassembly descriptors. */
2583 }
2584
2585 int
2586 pf_normalize_tcp_stateful(pbuf_t *pbuf, int off, struct pf_pdesc *pd,
2587     u_short *reason, struct tcphdr *th, struct pf_state *state,
2588     struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
2589 {
2590         struct timeval uptime;
2591         u_int32_t tsval = 0, tsecr = 0;
2592         u_int tsval_from_last;
2593         u_int8_t hdr[60];
2594         u_int8_t *opt;
2595         int copyback = 0;
2596         int got_ts = 0;
2597
2598         VERIFY(src->scrub || dst->scrub);
2599
2600         /*
2601          * Enforce the minimum TTL seen for this connection.  Negate a common
2602          * technique to evade an intrusion detection system and confuse
2603          * firewall state code.
2604          */
2605         switch (pd->af) {
2606 #if INET
2607         case AF_INET: {
2608                 if (src->scrub) {
2609                         struct ip *h = pbuf->pb_data;
2610                         if (h->ip_ttl > src->scrub->pfss_ttl)
2611                                 src->scrub->pfss_ttl = h->ip_ttl;
2612                         h->ip_ttl = src->scrub->pfss_ttl;
2613                 }
2614                 break;
2615         }
2616 #endif /* INET */
2617 #if INET6
2618         case AF_INET6: {
2619                 if (src->scrub) {
2620                         struct ip6_hdr *h = pbuf->pb_data;
2621                         if (h->ip6_hlim > src->scrub->pfss_ttl)
2622                                 src->scrub->pfss_ttl = h->ip6_hlim;
2623                         h->ip6_hlim = src->scrub->pfss_ttl;
2624                 }
2625                 break;
2626         }
2627 #endif /* INET6 */
2628         }
2629
2630         if (th->th_off > (sizeof (struct tcphdr) >> 2) &&
2631             ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
2632             (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
2633             pf_pull_hdr(pbuf, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
2634                 /* Diddle with TCP options */
2635                 int hlen;
2636                 opt = hdr + sizeof (struct tcphdr);
2637                 hlen = (th->th_off << 2) - sizeof (struct tcphdr);
2638                 while (hlen >= TCPOLEN_TIMESTAMP) {
2639                         switch (*opt) {
2640                         case TCPOPT_EOL:        /* FALLTHROUGH */
2641                         case TCPOPT_NOP:
2642                                 opt++;
2643                                 hlen--;
2644                                 break;
2645                         case TCPOPT_TIMESTAMP:
2646                                 /*
2647                                  * Modulate the timestamps.  Can be used for
2648                                  * NAT detection, OS uptime determination or
2649                                  * reboot detection.
2650                                  */
2651
2652                                 if (got_ts) {
2653                                         /* Huh?  Multiple timestamps!? */
2654                                         if (pf_status.debug >= PF_DEBUG_MISC) {
2655                                                 DPFPRINTF(("multiple TS??"));
2656                                                 pf_print_state(state);
2657                                                 printf("\n");
2658                                         }
2659                                         REASON_SET(reason, PFRES_TS);
2660                                         return (PF_DROP);
2661                                 }
2662                                 if (opt[1] >= TCPOLEN_TIMESTAMP) {
2663                                         memcpy(&tsval, &opt[2],
2664                                             sizeof (u_int32_t));
2665                                         if (tsval && src->scrub &&
2666                                             (src->scrub->pfss_flags &
2667                                             PFSS_TIMESTAMP)) {
2668                                                 tsval = ntohl(tsval);
2669                                                 pf_change_a(&opt[2],
2670                                                     &th->th_sum,
2671                                                     htonl(tsval +
2672                                                     src->scrub->pfss_ts_mod),
2673                                                     0);
2674                                                 copyback = 1;
2675                                         }
2676
2677                                         /* Modulate TS reply iff valid (!0) */
2678                                         memcpy(&tsecr, &opt[6],
2679                                             sizeof (u_int32_t));
2680                                         if (tsecr && dst->scrub &&
2681                                             (dst->scrub->pfss_flags &
2682                                             PFSS_TIMESTAMP)) {
2683                                                 tsecr = ntohl(tsecr)
2684                                                     - dst->scrub->pfss_ts_mod;
2685                                                 pf_change_a(&opt[6],
2686                                                     &th->th_sum, htonl(tsecr),
2687                                                     0);
2688                                                 copyback = 1;
2689                                         }
2690                                         got_ts = 1;
2691                                 }
2692                                 /* FALLTHROUGH */
2693                         default:
2694                                 hlen -= MAX(opt[1], 2);
2695                                 opt += MAX(opt[1], 2);
2696                                 break;
2697                         }
2698                 }
2699                 if (copyback) {
2700                         /* Copyback the options, caller copys back header */
2701                         int optoff = off + sizeof (*th);
2702                         int optlen = (th->th_off << 2) - sizeof (*th);
2703                         if (pf_lazy_makewritable(pd, pbuf, optoff + optlen) ==
2704                             NULL) {
2705                                 REASON_SET(reason, PFRES_MEMORY);
2706                                 return PF_DROP;
2707                         }
2708                         *writeback = optoff + optlen;
2709                         pbuf_copy_back(pbuf, optoff, optlen, hdr + sizeof(*th));
2710                 }
2711         }
2712
2713
2714         /*
2715          * Must invalidate PAWS checks on connections idle for too long.
2716          * The fastest allowed timestamp clock is 1ms.  That turns out to
2717          * be about 24 days before it wraps.  XXX Right now our lowerbound
2718          * TS echo check only works for the first 12 days of a connection
2719          * when the TS has exhausted half its 32bit space
2720          */
2721 #define TS_MAX_IDLE     (24*24*60*60)
2722 #define TS_MAX_CONN     (12*24*60*60)   /* XXX remove when better tsecr check */
2723
2724         getmicrouptime(&uptime);
2725         if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
2726             (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
2727             pf_time_second() - state->creation > TS_MAX_CONN))  {
2728                 if (pf_status.debug >= PF_DEBUG_MISC) {
2729                         DPFPRINTF(("src idled out of PAWS\n"));
2730                         pf_print_state(state);
2731                         printf("\n");
2732                 }
2733                 src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
2734                     | PFSS_PAWS_IDLED;
2735         }
2736         if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
2737             uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
2738                 if (pf_status.debug >= PF_DEBUG_MISC) {
2739                         DPFPRINTF(("dst idled out of PAWS\n"));
2740                         pf_print_state(state);
2741                         printf("\n");
2742                 }
2743                 dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
2744                     | PFSS_PAWS_IDLED;
2745         }
2746
2747         if (got_ts && src->scrub && dst->scrub &&
2748             (src->scrub->pfss_flags & PFSS_PAWS) &&
2749             (dst->scrub->pfss_flags & PFSS_PAWS)) {
2750                 /*
2751                  * Validate that the timestamps are "in-window".
2752                  * RFC1323 describes TCP Timestamp options that allow
2753                  * measurement of RTT (round trip time) and PAWS
2754                  * (protection against wrapped sequence numbers).  PAWS
2755                  * gives us a set of rules for rejecting packets on
2756                  * long fat pipes (packets that were somehow delayed
2757                  * in transit longer than the time it took to send the
2758                  * full TCP sequence space of 4Gb).  We can use these
2759                  * rules and infer a few others that will let us treat
2760                  * the 32bit timestamp and the 32bit echoed timestamp
2761                  * as sequence numbers to prevent a blind attacker from
2762                  * inserting packets into a connection.
2763                  *
2764                  * RFC1323 tells us:
2765                  *  - The timestamp on this packet must be greater than
2766                  *    or equal to the last value echoed by the other
2767                  *    endpoint.  The RFC says those will be discarded
2768                  *    since it is a dup that has already been acked.
2769                  *    This gives us a lowerbound on the timestamp.
2770                  *        timestamp >= other last echoed timestamp
2771                  *  - The timestamp will be less than or equal to
2772                  *    the last timestamp plus the time between the
2773                  *    last packet and now.  The RFC defines the max
2774                  *    clock rate as 1ms.  We will allow clocks to be
2775                  *    up to 10% fast and will allow a total difference
2776                  *    or 30 seconds due to a route change.  And this
2777                  *    gives us an upperbound on the timestamp.
2778                  *        timestamp <= last timestamp + max ticks
2779                  *    We have to be careful here.  Windows will send an
2780                  *    initial timestamp of zero and then initialize it
2781                  *    to a random value after the 3whs; presumably to
2782                  *    avoid a DoS by having to call an expensive RNG
2783                  *    during a SYN flood.  Proof MS has at least one
2784                  *    good security geek.
2785                  *
2786                  *  - The TCP timestamp option must also echo the other
2787                  *    endpoints timestamp.  The timestamp echoed is the
2788                  *    one carried on the earliest unacknowledged segment
2789                  *    on the left edge of the sequence window.  The RFC
2790                  *    states that the host will reject any echoed
2791                  *    timestamps that were larger than any ever sent.
2792                  *    This gives us an upperbound on the TS echo.
2793                  *        tescr <= largest_tsval
2794                  *  - The lowerbound on the TS echo is a little more
2795                  *    tricky to determine.  The other endpoint's echoed
2796                  *    values will not decrease.  But there may be
2797                  *    network conditions that re-order packets and
2798                  *    cause our view of them to decrease.  For now the
2799                  *    only lowerbound we can safely determine is that
2800                  *    the TS echo will never be less than the original
2801                  *    TS.  XXX There is probably a better lowerbound.
2802                  *    Remove TS_MAX_CONN with better lowerbound check.
2803                  *        tescr >= other original TS
2804                  *
2805                  * It is also important to note that the fastest
2806                  * timestamp clock of 1ms will wrap its 32bit space in
2807                  * 24 days.  So we just disable TS checking after 24
2808                  * days of idle time.  We actually must use a 12d
2809                  * connection limit until we can come up with a better
2810                  * lowerbound to the TS echo check.
2811                  */
2812                 struct timeval delta_ts;
2813                 int ts_fudge;
2814
2815
2816                 /*
2817                  * PFTM_TS_DIFF is how many seconds of leeway to allow
2818                  * a host's timestamp.  This can happen if the previous
2819                  * packet got delayed in transit for much longer than
2820                  * this packet.
2821                  */
2822                 if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
2823                         ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];
2824
2825
2826                 /* Calculate max ticks since the last timestamp */
2827 #define TS_MAXFREQ      1100            /* RFC max TS freq of 1Khz + 10% skew */
2828 #define TS_MICROSECS    1000000         /* microseconds per second */
2829                 timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
2830                 tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
2831                 tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
2832
2833
2834                 if ((src->state >= TCPS_ESTABLISHED &&
2835                     dst->state >= TCPS_ESTABLISHED) &&
2836                     (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
2837                     SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
2838                     (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
2839                     SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
2840                         /*
2841                          * Bad RFC1323 implementation or an insertion attack.
2842                          *
2843                          * - Solaris 2.6 and 2.7 are known to send another ACK
2844                          *   after the FIN,FIN|ACK,ACK closing that carries
2845                          *   an old timestamp.
2846                          */
2847
2848                         DPFPRINTF(("Timestamp failed %c%c%c%c\n",
2849                             SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
2850                             SEQ_GT(tsval, src->scrub->pfss_tsval +
2851                             tsval_from_last) ? '1' : ' ',
2852                             SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
2853                             SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
2854                         DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
2855                             "idle: %lus %ums\n",
2856                             tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
2857                             delta_ts.tv_usec / 1000));
2858                         DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
2859                             src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
2860                         DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u\n",
2861                             dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr,
2862                             dst->scrub->pfss_tsval0));
2863                         if (pf_status.debug >= PF_DEBUG_MISC) {
2864                                 pf_print_state(state);
2865                                 pf_print_flags(th->th_flags);
2866                                 printf("\n");
2867                         }
2868                         REASON_SET(reason, PFRES_TS);
2869                         return (PF_DROP);
2870                 }
2871
2872                 /* XXX I'd really like to require tsecr but it's optional */
2873
2874         } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
2875             ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
2876             || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
2877             src->scrub && dst->scrub &&
2878             (src->scrub->pfss_flags & PFSS_PAWS) &&
2879             (dst->scrub->pfss_flags & PFSS_PAWS)) {
2880                 /*
2881                  * Didn't send a timestamp.  Timestamps aren't really useful
2882                  * when:
2883                  *  - connection opening or closing (often not even sent).
2884                  *    but we must not let an attacker to put a FIN on a
2885                  *    data packet to sneak it through our ESTABLISHED check.
2886                  *  - on a TCP reset.  RFC suggests not even looking at TS.
2887                  *  - on an empty ACK.  The TS will not be echoed so it will
2888                  *    probably not help keep the RTT calculation in sync and
2889                  *    there isn't as much danger when the sequence numbers
2890                  *    got wrapped.  So some stacks don't include TS on empty
2891                  *    ACKs :-(
2892                  *
2893                  * To minimize the disruption to mostly RFC1323 conformant
2894                  * stacks, we will only require timestamps on data packets.
2895                  *
2896                  * And what do ya know, we cannot require timestamps on data
2897                  * packets.  There appear to be devices that do legitimate
2898                  * TCP connection hijacking.  There are HTTP devices that allow
2899                  * a 3whs (with timestamps) and then buffer the HTTP request.
2900                  * If the intermediate device has the HTTP response cache, it
2901                  * will spoof the response but not bother timestamping its
2902                  * packets.  So we can look for the presence of a timestamp in
2903                  * the first data packet and if there, require it in all future
2904                  * packets.
2905                  */
2906
2907                 if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
2908                         /*
2909                          * Hey!  Someone tried to sneak a packet in.  Or the
2910                          * stack changed its RFC1323 behavior?!?!
2911                          */
2912                         if (pf_status.debug >= PF_DEBUG_MISC) {
2913                                 DPFPRINTF(("Did not receive expected RFC1323 "
2914                                     "timestamp\n"));
2915                                 pf_print_state(state);
2916                                 pf_print_flags(th->th_flags);
2917                                 printf("\n");
2918                         }
2919                         REASON_SET(reason, PFRES_TS);
2920                         return (PF_DROP);
2921                 }
2922         }
2923
2924
2925         /*
2926          * We will note if a host sends his data packets with or without
2927          * timestamps.  And require all data packets to contain a timestamp
2928          * if the first does.  PAWS implicitly requires that all data packets be
2929          * timestamped.  But I think there are middle-man devices that hijack
2930          * TCP streams immediately after the 3whs and don't timestamp their
2931          * packets (seen in a WWW accelerator or cache).
2932          */
2933         if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
2934             (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
2935                 if (got_ts)
2936                         src->scrub->pfss_flags |= PFSS_DATA_TS;
2937                 else {
2938                         src->scrub->pfss_flags |= PFSS_DATA_NOTS;
2939                         if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
2940                             (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
2941                                 /* Don't warn if other host rejected RFC1323 */
2942                                 DPFPRINTF(("Broken RFC1323 stack did not "
2943                                     "timestamp data packet. Disabled PAWS "
2944                                     "security.\n"));
2945                                 pf_print_state(state);
2946                                 pf_print_flags(th->th_flags);
2947                                 printf("\n");
2948                         }
2949                 }
2950         }
2951
2952
2953         /*
2954          * Update PAWS values
2955          */
2956         if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
2957             (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
2958                 getmicrouptime(&src->scrub->pfss_last);
2959                 if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
2960                     (src->scrub->pfss_flags & PFSS_PAWS) == 0)
2961                         src->scrub->pfss_tsval = tsval;
2962
2963                 if (tsecr) {
2964                         if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
2965                             (src->scrub->pfss_flags & PFSS_PAWS) == 0)
2966                                 src->scrub->pfss_tsecr = tsecr;
2967
2968                         if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
2969                             (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
2970                             src->scrub->pfss_tsval0 == 0)) {
2971                                 /* tsval0 MUST be the lowest timestamp */
2972                                 src->scrub->pfss_tsval0 = tsval;
2973                         }
2974
2975                         /* Only fully initialized after a TS gets echoed */
2976                         if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
2977                                 src->scrub->pfss_flags |= PFSS_PAWS;
2978                 }
2979         }
2980
2981         /* I have a dream....  TCP segment reassembly.... */
2982         return (0);
2983 }
2984
2985 static int
2986 pf_normalize_tcpopt(struct pf_rule *r, int dir, struct pfi_kif *kif,
2987     struct pf_pdesc *pd, pbuf_t *pbuf, struct tcphdr *th, int off,
2988     int *rewrptr)
2989 {
2990 #pragma unused(dir, kif)
2991         sa_family_t af = pd->af;
2992         u_int16_t       *mss;
2993         int             thoff;
2994         int             opt, cnt, optlen = 0;
2995         int             rewrite = 0;
2996         u_char          opts[MAX_TCPOPTLEN];
2997         u_char          *optp = opts;
2998
2999         thoff = th->th_off << 2;
3000         cnt = thoff - sizeof (struct tcphdr);
3001
3002         if (cnt > 0 && !pf_pull_hdr(pbuf, off + sizeof (*th), opts, cnt,
3003             NULL, NULL, af))
3004                 return PF_DROP;
3005
3006         for (; cnt > 0; cnt -= optlen, optp += optlen) {
3007                 opt = optp[0];
3008                 if (opt == TCPOPT_EOL)
3009                         break;
3010                 if (opt == TCPOPT_NOP)
3011                         optlen = 1;
3012                 else {
3013                         if (cnt < 2)
3014                                 break;
3015                         optlen = optp[1];
3016                         if (optlen < 2 || optlen > cnt)
3017                                 break;
3018                 }
3019                 switch (opt) {
3020                 case TCPOPT_MAXSEG:
3021                         mss = (u_int16_t *)(void *)(optp + 2);
3022                         if ((ntohs(*mss)) > r->max_mss) {
3023                                 /*
3024                                  * <jhw@apple.com>
3025                                  *  Only do the TCP checksum fixup if delayed
3026                                  * checksum calculation will not be performed.
3027                                  */
3028                                 if (pbuf->pb_ifp ||
3029                                     !(*pbuf->pb_csum_flags & CSUM_TCP))
3030                                         th->th_sum = pf_cksum_fixup(th->th_sum,
3031                                             *mss, htons(r->max_mss), 0);
3032                                 *mss = htons(r->max_mss);
3033                                 rewrite = 1;
3034                         }
3035                         break;
3036                 default:
3037                         break;
3038                 }
3039         }
3040
3041         if (rewrite) {
3042                 u_short reason;
3043
3044                 VERIFY(pbuf == pd->mp);
3045
3046                 if (pf_lazy_makewritable(pd, pd->mp,
3047                     off + sizeof (*th) + thoff) == NULL) {
3048                         REASON_SET(&reason, PFRES_MEMORY);
3049                         if (r->log)
3050                                 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, reason,
3051                                     r, 0, 0, pd);
3052                         return PF_DROP;
3053                 }
3054
3055                 *rewrptr = 1;
3056                 pbuf_copy_back(pd->mp, off + sizeof (*th), thoff - sizeof (*th), opts);
3057         }
3058
3059         return PF_PASS;
3060 }