/*
 * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* $apfw: pf_norm.c,v 1.10 2008/08/28 19:10:53 jhw Exp $ */
/* $OpenBSD: pf_norm.c,v 1.107 2006/04/16 00:59:52 pascoe Exp $ */

/*
 * Copyright 2001 Niels Provos <provos@citi.umich.edu>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/socket.h>
#include <sys/kernel.h>
#include <sys/random.h>
#include <sys/mcache.h>

#include <net/if_types.h>
#include <net/route.h>
#include <net/if_pflog.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_fsm.h>
#include <netinet/udp.h>
#include <netinet/ip_icmp.h>

#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>

#include <net/pfvar.h>
struct pf_frent {
	LIST_ENTRY(pf_frent) fr_next;
	struct mbuf *fr_m;
#define fr_ip	fr_u.fru_ipv4
#define fr_ip6	fr_u.fru_ipv6
	union {
		struct ip *fru_ipv4;
		struct ip6_hdr *fru_ipv6;
	} fr_u;
	struct ip6_frag fr_ip6f_opt;
	uint16_t fr_ip6f_hlen;		/* total header length */
	uint16_t fr_ip6f_extoff;	/* last extension header offset or 0 */
};

struct pf_frcache {
	LIST_ENTRY(pf_frcache) fr_next;
	uint16_t fr_off;
	uint16_t fr_end;
};

#define PFFRAG_SEENLAST	0x0001	/* Seen the last fragment for this */
#define PFFRAG_NOBUFFER	0x0002	/* Non-buffering fragment cache */
#define PFFRAG_DROP	0x0004	/* Drop all fragments */
#define BUFFER_FRAGMENTS(fr)	(!((fr)->fr_flags & PFFRAG_NOBUFFER))

struct pf_fragment {
	RB_ENTRY(pf_fragment) fr_entry;
	TAILQ_ENTRY(pf_fragment) frag_next;
	struct pf_addr fr_srcx;
	struct pf_addr fr_dstx;
	u_int8_t fr_p;		/* protocol of this fragment */
	u_int8_t fr_flags;	/* status flags */
	u_int16_t fr_max;	/* fragment data max */
#define fr_id	fr_uid.fru_id4
#define fr_id6	fr_uid.fru_id6
	union {
		u_int16_t fru_id4;
		u_int32_t fru_id6;
	} fr_uid;
	int fr_af;
	u_int32_t fr_timeout;
#define fr_queue	fr_u.fru_queue
#define fr_cache	fr_u.fru_cache
	union {
		LIST_HEAD(pf_fragq, pf_frent) fru_queue;	/* buffering */
		LIST_HEAD(pf_cacheq, pf_frcache) fru_cache;	/* non-buf */
	} fr_u;
	uint32_t fr_csum_flags;	/* checksum flags */
	uint32_t fr_csum;	/* partial checksum value */
	uint16_t fr_ip6_maxlen;	/* maximum length of a single fragment in IPv6 */
};

static TAILQ_HEAD(pf_fragqueue, pf_fragment) pf_fragqueue;
static TAILQ_HEAD(pf_cachequeue, pf_fragment) pf_cachequeue;
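
/*
 * NOTE: a pf_fragment descriptor is used in two modes, selected by
 * PFFRAG_NOBUFFER in fr_flags: either it buffers the actual fragment
 * mbufs on fr_queue for full reassembly, or it only records the byte
 * ranges already seen on fr_cache (the non-buffering fragment cache).
 * BUFFER_FRAGMENTS() distinguishes the two throughout this file.
 */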
static __inline int pf_frag_compare(struct pf_fragment *,
    struct pf_fragment *);
static RB_HEAD(pf_frag_tree, pf_fragment) pf_frag_tree, pf_cache_tree;
RB_PROTOTYPE_SC(__private_extern__, pf_frag_tree, pf_fragment, fr_entry,
    pf_frag_compare);
RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
/* Private prototypes */
static void pf_ip6hdr2key(struct pf_fragment *, struct ip6_hdr *,
    struct ip6_frag *);
static void pf_ip2key(struct pf_fragment *, struct ip *);
static void pf_remove_fragment(struct pf_fragment *);
static void pf_flush_fragments(void);
static void pf_free_fragment(struct pf_fragment *);
static struct pf_fragment *pf_find_fragment_by_key(struct pf_fragment *,
    struct pf_frag_tree *);
static __inline struct pf_fragment *
    pf_find_fragment_by_ipv4_header(struct ip *, struct pf_frag_tree *);
static struct mbuf *pf_reassemble(struct mbuf *, struct pf_fragment **,
    struct pf_frent *, int);
static struct mbuf *pf_fragcache(struct mbuf **, struct ip *,
    struct pf_fragment **, int, int, int *);
static int pf_normalize_tcpopt(struct pf_rule *, int, struct pfi_kif *,
    struct pf_pdesc *, pbuf_t *, struct tcphdr *, int, int *);
static __inline struct pf_fragment *
    pf_find_fragment_by_ipv6_header(struct ip6_hdr *, struct ip6_frag *,
    struct pf_frag_tree *);
static struct mbuf *pf_reassemble6(struct mbuf **, struct pf_fragment **,
    struct pf_frent *, int);
static struct mbuf *pf_frag6cache(struct mbuf **, struct ip6_hdr *,
    struct ip6_frag *, struct pf_fragment **, int, int, int, int *);
#define DPFPRINTF(x) do {					\
	if (pf_status.debug >= PF_DEBUG_MISC) {			\
		printf("%s: ", __func__);			\
		printf x;					\
	}							\
} while (0)

/* Globals */
struct pool pf_frent_pl, pf_frag_pl;
static struct pool pf_cache_pl, pf_cent_pl;
struct pool pf_state_scrub_pl;

static int pf_nfrents, pf_ncache;
void
pf_normalize_init(void)
{
	pool_init(&pf_frent_pl, sizeof(struct pf_frent), 0, 0, 0, "pffrent",
	    NULL);
	pool_init(&pf_frag_pl, sizeof(struct pf_fragment), 0, 0, 0, "pffrag",
	    NULL);
	pool_init(&pf_cache_pl, sizeof(struct pf_fragment), 0, 0, 0,
	    "pffrcache", NULL);
	pool_init(&pf_cent_pl, sizeof(struct pf_frcache), 0, 0, 0, "pffrcent",
	    NULL);
	pool_init(&pf_state_scrub_pl, sizeof(struct pf_state_scrub), 0, 0, 0,
	    "pfstscr", NULL);

	pool_sethiwat(&pf_frag_pl, PFFRAG_FRAG_HIWAT);
	pool_sethardlimit(&pf_frent_pl, PFFRAG_FRENT_HIWAT, NULL, 0);
	pool_sethardlimit(&pf_cache_pl, PFFRAG_FRCACHE_HIWAT, NULL, 0);
	pool_sethardlimit(&pf_cent_pl, PFFRAG_FRCENT_HIWAT, NULL, 0);

	TAILQ_INIT(&pf_fragqueue);
	TAILQ_INIT(&pf_cachequeue);
}
void
pf_normalize_destroy(void)
{
	pool_destroy(&pf_state_scrub_pl);
	pool_destroy(&pf_cent_pl);
	pool_destroy(&pf_cache_pl);
	pool_destroy(&pf_frag_pl);
	pool_destroy(&pf_frent_pl);
}
int
pf_normalize_isempty(void)
{
	return TAILQ_EMPTY(&pf_fragqueue) && TAILQ_EMPTY(&pf_cachequeue);
}
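
/*
 * Ordering function for the red-black trees: fragments are keyed by
 * address family, protocol, fragment id and the source/destination
 * addresses, compared most-significant field first.
 */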
static __inline int
pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
{
	int diff;

	if ((diff = a->fr_af - b->fr_af)) {
		return diff;
	} else if ((diff = a->fr_p - b->fr_p)) {
		return diff;
	} else {
		struct pf_addr *sa = &a->fr_srcx;
		struct pf_addr *sb = &b->fr_srcx;
		struct pf_addr *da = &a->fr_dstx;
		struct pf_addr *db = &b->fr_dstx;

		switch (a->fr_af) {
		case AF_INET:
			if ((diff = a->fr_id - b->fr_id)) {
				return diff;
			} else if (sa->v4addr.s_addr < sb->v4addr.s_addr) {
				return -1;
			} else if (sa->v4addr.s_addr > sb->v4addr.s_addr) {
				return 1;
			} else if (da->v4addr.s_addr < db->v4addr.s_addr) {
				return -1;
			} else if (da->v4addr.s_addr > db->v4addr.s_addr) {
				return 1;
			}
			break;
		case AF_INET6:
			if ((diff = a->fr_id6 - b->fr_id6)) {
				return diff;
			} else if (sa->addr32[3] < sb->addr32[3]) {
				return -1;
			} else if (sa->addr32[3] > sb->addr32[3]) {
				return 1;
			} else if (sa->addr32[2] < sb->addr32[2]) {
				return -1;
			} else if (sa->addr32[2] > sb->addr32[2]) {
				return 1;
			} else if (sa->addr32[1] < sb->addr32[1]) {
				return -1;
			} else if (sa->addr32[1] > sb->addr32[1]) {
				return 1;
			} else if (sa->addr32[0] < sb->addr32[0]) {
				return -1;
			} else if (sa->addr32[0] > sb->addr32[0]) {
				return 1;
			} else if (da->addr32[3] < db->addr32[3]) {
				return -1;
			} else if (da->addr32[3] > db->addr32[3]) {
				return 1;
			} else if (da->addr32[2] < db->addr32[2]) {
				return -1;
			} else if (da->addr32[2] > db->addr32[2]) {
				return 1;
			} else if (da->addr32[1] < db->addr32[1]) {
				return -1;
			} else if (da->addr32[1] > db->addr32[1]) {
				return 1;
			} else if (da->addr32[0] < db->addr32[0]) {
				return -1;
			} else if (da->addr32[0] > db->addr32[0]) {
				return 1;
			}
			break;
		default:
			VERIFY(0 && "only IPv4 and IPv6 supported!");
			break;
		}
	}
	return 0;
}
void
pf_purge_expired_fragments(void)
{
	struct pf_fragment *frag;
	u_int32_t expire = pf_time_second() -
	    pf_default_rule.timeout[PFTM_FRAG];

	while ((frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue)) != NULL) {
		VERIFY(BUFFER_FRAGMENTS(frag));
		if (frag->fr_timeout > expire) {
			break;
		}

		switch (frag->fr_af) {
		case AF_INET:
			DPFPRINTF(("expiring IPv4 %d(0x%llx) from queue.\n",
			    ntohs(frag->fr_id),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		case AF_INET6:
			DPFPRINTF(("expiring IPv6 %d(0x%llx) from queue.\n",
			    ntohl(frag->fr_id6),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		default:
			VERIFY(0 && "only IPv4 and IPv6 supported");
			break;
		}
		pf_free_fragment(frag);
	}

	while ((frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue)) != NULL) {
		VERIFY(!BUFFER_FRAGMENTS(frag));
		if (frag->fr_timeout > expire) {
			break;
		}

		switch (frag->fr_af) {
		case AF_INET:
			DPFPRINTF(("expiring IPv4 %d(0x%llx) from cache.\n",
			    ntohs(frag->fr_id),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		case AF_INET6:
			DPFPRINTF(("expiring IPv6 %d(0x%llx) from cache.\n",
			    ntohl(frag->fr_id6),
			    (uint64_t)VM_KERNEL_ADDRPERM(frag)));
			break;
		default:
			VERIFY(0 && "only IPv4 and IPv6 supported");
			break;
		}
		pf_free_fragment(frag);
		VERIFY(TAILQ_EMPTY(&pf_cachequeue) ||
		    TAILQ_LAST(&pf_cachequeue, pf_cachequeue) != frag);
	}
}
/*
 * Try to flush old fragments to make space for new ones
 */
static void
pf_flush_fragments(void)
{
	struct pf_fragment *frag;
	int goal;

	goal = pf_nfrents * 9 / 10;
	DPFPRINTF(("trying to free > %d frents\n",
	    pf_nfrents - goal));
	while (goal < pf_nfrents) {
		frag = TAILQ_LAST(&pf_fragqueue, pf_fragqueue);
		if (frag == NULL) {
			break;
		}
		pf_free_fragment(frag);
	}

	goal = pf_ncache * 9 / 10;
	DPFPRINTF(("trying to free > %d cache entries\n",
	    pf_ncache - goal));
	while (goal < pf_ncache) {
		frag = TAILQ_LAST(&pf_cachequeue, pf_cachequeue);
		if (frag == NULL) {
			break;
		}
		pf_free_fragment(frag);
	}
}
/* Frees the fragments and all associated entries */
static void
pf_free_fragment(struct pf_fragment *frag)
{
	struct pf_frent *frent;
	struct pf_frcache *frcache;

	/* Free all fragments */
	if (BUFFER_FRAGMENTS(frag)) {
		for (frent = LIST_FIRST(&frag->fr_queue); frent;
		    frent = LIST_FIRST(&frag->fr_queue)) {
			LIST_REMOVE(frent, fr_next);

			m_freem(frent->fr_m);
			pool_put(&pf_frent_pl, frent);
			pf_nfrents--;
		}
	} else {
		for (frcache = LIST_FIRST(&frag->fr_cache); frcache;
		    frcache = LIST_FIRST(&frag->fr_cache)) {
			LIST_REMOVE(frcache, fr_next);

			VERIFY(LIST_EMPTY(&frag->fr_cache) ||
			    LIST_FIRST(&frag->fr_cache)->fr_off >
			    frcache->fr_end);

			pool_put(&pf_cent_pl, frcache);
			pf_ncache--;
		}
	}

	pf_remove_fragment(frag);
}
static void
pf_ip6hdr2key(struct pf_fragment *key, struct ip6_hdr *ip6,
    struct ip6_frag *fh)
{
	key->fr_p = fh->ip6f_nxt;
	key->fr_id6 = fh->ip6f_ident;
	key->fr_af = AF_INET6;
	key->fr_srcx.v6addr = ip6->ip6_src;
	key->fr_dstx.v6addr = ip6->ip6_dst;
}
static void
pf_ip2key(struct pf_fragment *key, struct ip *ip)
{
	key->fr_p = ip->ip_p;
	key->fr_id = ip->ip_id;
	key->fr_af = AF_INET;
	key->fr_srcx.v4addr.s_addr = ip->ip_src.s_addr;
	key->fr_dstx.v4addr.s_addr = ip->ip_dst.s_addr;
}
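
/*
 * A lookup hit also refreshes the entry: the timeout is reset and the
 * descriptor is moved to the head of its queue, so the TAILQ effectively
 * acts as an LRU list for pf_purge_expired_fragments() and
 * pf_flush_fragments(), which both evict from the tail.
 */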
static struct pf_fragment *
pf_find_fragment_by_key(struct pf_fragment *key, struct pf_frag_tree *tree)
{
	struct pf_fragment *frag;

	frag = RB_FIND(pf_frag_tree, tree, key);
	if (frag != NULL) {
		/* XXX Are we sure we want to update the timeout? */
		frag->fr_timeout = pf_time_second();
		if (BUFFER_FRAGMENTS(frag)) {
			TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
			TAILQ_INSERT_HEAD(&pf_fragqueue, frag, frag_next);
		} else {
			TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
			TAILQ_INSERT_HEAD(&pf_cachequeue, frag, frag_next);
		}
	}

	return frag;
}
static __inline struct pf_fragment *
pf_find_fragment_by_ipv4_header(struct ip *ip, struct pf_frag_tree *tree)
{
	struct pf_fragment key;
	pf_ip2key(&key, ip);
	return pf_find_fragment_by_key(&key, tree);
}
/* Removes a fragment from the fragment queue and frees the fragment */
static void
pf_remove_fragment(struct pf_fragment *frag)
{
	if (BUFFER_FRAGMENTS(frag)) {
		RB_REMOVE(pf_frag_tree, &pf_frag_tree, frag);
		TAILQ_REMOVE(&pf_fragqueue, frag, frag_next);
		pool_put(&pf_frag_pl, frag);
	} else {
		RB_REMOVE(pf_frag_tree, &pf_cache_tree, frag);
		TAILQ_REMOVE(&pf_cachequeue, frag, frag_next);
		pool_put(&pf_cache_pl, frag);
	}
}
#define FR_IP_OFF(fr)	((ntohs((fr)->fr_ip->ip_off) & IP_OFFMASK) << 3)
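
/*
 * pf_reassemble() keeps the per-datagram fragment list sorted by offset.
 * FR_IP_OFF() recovers a fragment's byte offset: ip_off stores the offset
 * in 8-octet units in its low 13 bits (IP_OFFMASK), so e.g. a stored unit
 * count of 185 means 185 << 3 == 1480 bytes into the payload.  An arriving
 * fragment is trimmed on both sides: 'precut' bytes where it overlaps the
 * preceding fragment, while overlapping successors are either shortened
 * ('aftercut') or dropped outright when they are fully covered.
 */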
static struct mbuf *
pf_reassemble(struct mbuf *m0, struct pf_fragment **frag,
    struct pf_frent *frent, int mff)
{
	struct mbuf *m = m0, *m2;
	struct pf_frent *frea, *next;
	struct pf_frent *frep = NULL;
	struct ip *ip = frent->fr_ip;
	uint32_t hlen = ip->ip_hl << 2;
	u_int16_t off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
	u_int16_t ip_len = ntohs(ip->ip_len) - ip->ip_hl * 4;
	u_int16_t fr_max = ip_len + off;
	uint32_t csum, csum_flags;

	VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag));

	/*
	 * Leverage partial checksum offload for IP fragments.  Narrow down
	 * the scope to cover only UDP without IP options, as that is the
	 * most common case.
	 *
	 * Perform 1's complement adjustment of octets that got included/
	 * excluded in the hardware-calculated checksum value.  Ignore cases
	 * where the value includes the entire IPv4 header span, as the sum
	 * for those octets would already be 0 by the time we get here; IP
	 * has already performed its header checksum validation.  Also take
	 * care of any trailing bytes and subtract out their partial sum.
	 */
	if (ip->ip_p == IPPROTO_UDP && hlen == sizeof(struct ip) &&
	    (m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t start = m->m_pkthdr.csum_rx_start;
		int32_t trailer = (m_pktlen(m) - ntohs(ip->ip_len));
		uint32_t swbytes = (uint32_t)trailer;

		csum = m->m_pkthdr.csum_rx_val;

		ASSERT(trailer >= 0);
		if ((start != 0 && start != hlen) || trailer != 0) {
#if BYTE_ORDER != BIG_ENDIAN
			if (start < hlen) {
				HTONS(ip->ip_len);
				HTONS(ip->ip_off);
			}
#endif /* BYTE_ORDER != BIG_ENDIAN */
			/* callee folds in sum */
			csum = m_adj_sum16(m, start, hlen,
			    (ip->ip_len - hlen), csum);
			if (hlen > start) {
				swbytes += (hlen - start);
			} else {
				swbytes += (start - hlen);
			}
#if BYTE_ORDER != BIG_ENDIAN
			if (start < hlen) {
				NTOHS(ip->ip_off);
				NTOHS(ip->ip_len);
			}
#endif /* BYTE_ORDER != BIG_ENDIAN */
		}
		csum_flags = m->m_pkthdr.csum_flags;

		if (swbytes != 0) {
			udp_in_cksum_stats(swbytes);
		}
		if (trailer != 0) {
			m_adj(m, -trailer);
		}
	} else {
		csum = 0;
		csum_flags = 0;
	}

	/* Invalidate checksum */
	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;

	/* Strip off ip header */
	m->m_data += hlen;
	m->m_len -= hlen;

	/* Create a new reassembly queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
			if (*frag == NULL) {
				goto drop_fragment;
			}
		}

		(*frag)->fr_flags = 0;
		(*frag)->fr_max = 0;
		(*frag)->fr_af = AF_INET;
		(*frag)->fr_srcx.v4addr = frent->fr_ip->ip_src;
		(*frag)->fr_dstx.v4addr = frent->fr_ip->ip_dst;
		(*frag)->fr_p = frent->fr_ip->ip_p;
		(*frag)->fr_id = frent->fr_ip->ip_id;
		(*frag)->fr_timeout = pf_time_second();
		if (csum_flags != 0) {
			(*frag)->fr_csum_flags = csum_flags;
			(*frag)->fr_csum = csum;
		}
		LIST_INIT(&(*frag)->fr_queue);

		RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);

		/* We do not have a previous fragment */
		frep = NULL;
		goto insert;
	}

	/*
	 * If this fragment contains similar checksum offload info
	 * as that of the existing ones, accumulate checksum.  Otherwise,
	 * invalidate checksum offload info for the entire datagram.
	 */
	if (csum_flags != 0 && csum_flags == (*frag)->fr_csum_flags) {
		(*frag)->fr_csum += csum;
	} else if ((*frag)->fr_csum_flags != 0) {
		(*frag)->fr_csum_flags = 0;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
		if (FR_IP_OFF(frea) > off) {
			break;
		}
		frep = frea;
	}

	VERIFY(frep != NULL || frea != NULL);

	if (frep != NULL &&
	    FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl *
	    4 > off) {
		u_int16_t precut;

		precut = FR_IP_OFF(frep) + ntohs(frep->fr_ip->ip_len) -
		    frep->fr_ip->ip_hl * 4 - off;
		if (precut >= ip_len) {
			goto drop_fragment;
		}
		m_adj(frent->fr_m, precut);
		DPFPRINTF(("overlap -%d\n", precut));
		/* Enforce 8 byte boundaries */
		ip->ip_off = htons(ntohs(ip->ip_off) + (precut >> 3));
		off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
		ip_len -= precut;
		ip->ip_len = htons(ip_len);
	}

	for (; frea != NULL && ip_len + off > FR_IP_OFF(frea);
	    frea = next) {
		u_int16_t aftercut;

		aftercut = ip_len + off - FR_IP_OFF(frea);
		DPFPRINTF(("adjust overlap %d\n", aftercut));
		if (aftercut < ntohs(frea->fr_ip->ip_len) - frea->fr_ip->ip_hl
		    * 4) {
			frea->fr_ip->ip_len =
			    htons(ntohs(frea->fr_ip->ip_len) - aftercut);
			frea->fr_ip->ip_off = htons(ntohs(frea->fr_ip->ip_off) +
			    (aftercut >> 3));
			m_adj(frea->fr_m, aftercut);
			break;
		}

		/* This fragment is completely overlapped, lose it */
		next = LIST_NEXT(frea, fr_next);
		m_freem(frea->fr_m);
		LIST_REMOVE(frea, fr_next);
		pool_put(&pf_frent_pl, frea);
		pf_nfrents--;
	}

insert:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max) {
		(*frag)->fr_max = fr_max;
	}
	/* This is the last segment */
	if (!mff) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}

	if (frep == NULL) {
		LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
	} else {
		LIST_INSERT_AFTER(frep, frent, fr_next);
	}

	/* Check if we are completely reassembled */
	if (!((*frag)->fr_flags & PFFRAG_SEENLAST)) {
		return NULL;
	}

	/* Check if we have all the data */
	off = 0;
	for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
		next = LIST_NEXT(frep, fr_next);

		off += ntohs(frep->fr_ip->ip_len) - frep->fr_ip->ip_hl * 4;
		if (off < (*frag)->fr_max &&
		    (next == NULL || FR_IP_OFF(next) != off)) {
			DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
			    off, next == NULL ? -1 : FR_IP_OFF(next),
			    (*frag)->fr_max));
			return NULL;
		}
	}
	DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
	if (off < (*frag)->fr_max) {
		return NULL;
	}

	/* We have all the data */
	frent = LIST_FIRST(&(*frag)->fr_queue);
	VERIFY(frent != NULL);
	if ((frent->fr_ip->ip_hl << 2) + off > IP_MAXPACKET) {
		DPFPRINTF(("drop: too big: %d\n", off));
		pf_free_fragment(*frag);
		*frag = NULL;
		return NULL;
	}
	next = LIST_NEXT(frent, fr_next);

	/* Magic from ip_input */
	ip = frent->fr_ip;
	m = frent->fr_m;
	m2 = m->m_next;
	m->m_next = NULL;
	m_cat(m, m2);
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	for (frent = next; frent != NULL; frent = next) {
		next = LIST_NEXT(frent, fr_next);

		m2 = frent->fr_m;
		pool_put(&pf_frent_pl, frent);
		pf_nfrents--;
		m_cat(m, m2);
	}

	ip->ip_src = (*frag)->fr_srcx.v4addr;
	ip->ip_dst = (*frag)->fr_dstx.v4addr;

	if ((*frag)->fr_csum_flags != 0) {
		csum = (*frag)->fr_csum;

		ADDCARRY(csum);

		m->m_pkthdr.csum_rx_val = csum;
		m->m_pkthdr.csum_rx_start = sizeof(struct ip);
		m->m_pkthdr.csum_flags = (*frag)->fr_csum_flags;
	} else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
	    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
		/* loopback checksums are always OK */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags =
		    CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID;
	}

	/* Remove from fragment queue */
	pf_remove_fragment(*frag);
	*frag = NULL;

	hlen = ip->ip_hl << 2;
	ip->ip_len = htons(off + hlen);
	m->m_data -= hlen;
	m->m_len += hlen;

	/* some debugging cruft by sklower, below, will go away soon */
	/* XXX this should be done elsewhere */
	if (m->m_flags & M_PKTHDR) {
		int plen = 0;
		for (m2 = m; m2; m2 = m2->m_next) {
			plen += m2->m_len;
		}
		m->m_pkthdr.len = plen;
	}

	DPFPRINTF(("complete: 0x%llx(%d)\n",
	    (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip->ip_len)));
	return m;

drop_fragment:
	/* Oops - fail safe - drop packet */
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	m_freem(m);
	return NULL;
}
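
/*
 * The non-buffering fragment cache below backs the 'fragment crop' and
 * 'fragment drop-ovl' scrub modes: instead of holding on to mbufs it
 * records [fr_off, fr_end) ranges per datagram in a sorted pf_frcache
 * list, passes each fragment through immediately, and trims (or, with
 * 'drop' set, discards) any bytes that overlap a range already passed.
 */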
static struct mbuf *
pf_fragcache(struct mbuf **m0, struct ip *h, struct pf_fragment **frag, int mff,
    int drop, int *nomem)
{
	struct mbuf *m = *m0;
	struct pf_frcache *frp, *fra, *cur = NULL;
	int ip_len = ntohs(h->ip_len) - (h->ip_hl << 2);
	u_int16_t off = ntohs(h->ip_off) << 3;
	u_int16_t fr_max = ip_len + off;
	int hosed = 0;

	VERIFY(*frag == NULL || !BUFFER_FRAGMENTS(*frag));

	/* Create a new range queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
			if (*frag == NULL) {
				goto no_mem;
			}
		}

		/* Get an entry for the queue */
		cur = pool_get(&pf_cent_pl, PR_NOWAIT);
		if (cur == NULL) {
			pool_put(&pf_cache_pl, *frag);
			*frag = NULL;
			goto no_mem;
		}
		pf_ncache++;

		(*frag)->fr_flags = PFFRAG_NOBUFFER;
		(*frag)->fr_max = 0;
		(*frag)->fr_af = AF_INET;
		(*frag)->fr_srcx.v4addr = h->ip_src;
		(*frag)->fr_dstx.v4addr = h->ip_dst;
		(*frag)->fr_p = h->ip_p;
		(*frag)->fr_id = h->ip_id;
		(*frag)->fr_timeout = pf_time_second();

		cur->fr_off = off;
		cur->fr_end = fr_max;
		LIST_INIT(&(*frag)->fr_cache);
		LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);

		RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);

		DPFPRINTF(("fragcache[%d]: new %d-%d\n", h->ip_id, off,
		    fr_max));

		goto pass;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	frp = NULL;
	LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
		if (fra->fr_off > off) {
			break;
		}
		frp = fra;
	}

	VERIFY(frp != NULL || fra != NULL);

	if (frp != NULL) {
		int precut;

		precut = frp->fr_end - off;
		if (precut >= ip_len) {
			/* Fragment is entirely a duplicate */
			DPFPRINTF(("fragcache[%d]: dead (%d-%d) %d-%d\n",
			    h->ip_id, frp->fr_off, frp->fr_end, off, fr_max));
			goto drop_fragment;
		}
		if (precut == 0) {
			/* They are adjacent.  Fixup cache entry */
			DPFPRINTF(("fragcache[%d]: adjacent (%d-%d) %d-%d\n",
			    h->ip_id, frp->fr_off, frp->fr_end, off, fr_max));
			frp->fr_end = fr_max;
		} else if (precut > 0) {
			/*
			 * The first part of this payload overlaps with a
			 * fragment that has already been passed.
			 * Need to trim off the first part of the payload.
			 * But to do so easily, we need to create another
			 * mbuf to throw the original header into.
			 */

			DPFPRINTF(("fragcache[%d]: chop %d (%d-%d) %d-%d\n",
			    h->ip_id, precut, frp->fr_off, frp->fr_end, off,
			    fr_max));

			off += precut;
			fr_max -= precut;
			/* Update the previous frag to encompass this one */
			frp->fr_end = fr_max;

			if (!drop) {
				/*
				 * XXX Optimization opportunity
				 * This is a very heavy way to trim the payload.
				 * we could do it much faster by diddling mbuf
				 * internals but that would be even less legible
				 * than this mbuf magic.  For my next trick,
				 * I'll pull a rabbit out of my laptop.
				 */
				*m0 = m_copym(m, 0, h->ip_hl << 2, M_NOWAIT);
				if (*m0 == NULL) {
					goto no_mem;
				}
				VERIFY((*m0)->m_next == NULL);
				m_adj(m, precut + (h->ip_hl << 2));
				m_cat(*m0, m);
				m = *m0;
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next) {
						plen += t->m_len;
					}
					m->m_pkthdr.len = plen;
				}

				h = mtod(m, struct ip *);

				VERIFY((int)m->m_len ==
				    ntohs(h->ip_len) - precut);
				h->ip_off = htons(ntohs(h->ip_off) +
				    (precut >> 3));
				h->ip_len = htons(ntohs(h->ip_len) - precut);
			} else {
				hosed++;
			}
		} else {
			/* There is a gap between fragments */

			DPFPRINTF(("fragcache[%d]: gap %d (%d-%d) %d-%d\n",
			    h->ip_id, -precut, frp->fr_off, frp->fr_end, off,
			    fr_max));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL) {
				goto no_mem;
			}
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_AFTER(frp, cur, fr_next);
		}
	}

	if (fra != NULL) {
		int aftercut;
		int merge = 0;

		aftercut = fr_max - fra->fr_off;
		if (aftercut == 0) {
			/* Adjacent fragments */
			DPFPRINTF(("fragcache[%d]: adjacent %d-%d (%d-%d)\n",
			    h->ip_id, off, fr_max, fra->fr_off, fra->fr_end));
			fra->fr_off = off;
			merge = 1;
		} else if (aftercut > 0) {
			/* Need to chop off the tail of this fragment */
			DPFPRINTF(("fragcache[%d]: chop %d %d-%d (%d-%d)\n",
			    h->ip_id, aftercut, off, fr_max, fra->fr_off,
			    fra->fr_end));
			fra->fr_off = off;
			fr_max -= aftercut;

			merge = 1;

			if (!drop) {
				m_adj(m, -aftercut);
				if (m->m_flags & M_PKTHDR) {
					int plen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next) {
						plen += t->m_len;
					}
					m->m_pkthdr.len = plen;
				}
				h = mtod(m, struct ip *);
				VERIFY((int)m->m_len ==
				    ntohs(h->ip_len) - aftercut);
				h->ip_len = htons(ntohs(h->ip_len) - aftercut);
			} else {
				hosed++;
			}
		} else if (frp == NULL) {
			/* There is a gap between fragments */
			DPFPRINTF(("fragcache[%d]: gap %d %d-%d (%d-%d)\n",
			    h->ip_id, -aftercut, off, fr_max, fra->fr_off,
			    fra->fr_end));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL) {
				goto no_mem;
			}
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_BEFORE(fra, cur, fr_next);
		}

		/* Need to glue together two separate fragment descriptors */
		if (merge) {
			if (cur && fra->fr_off <= cur->fr_end) {
				/* Need to merge in a previous 'cur' */
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, cur->fr_off, cur->fr_end, off,
				    fr_max, fra->fr_off, fra->fr_end));
				fra->fr_off = cur->fr_off;
				LIST_REMOVE(cur, fr_next);
				pool_put(&pf_cent_pl, cur);
				cur = NULL;
				pf_ncache--;
			} else if (frp && fra->fr_off <= frp->fr_end) {
				/* Need to merge in a modified 'frp' */
				VERIFY(cur == NULL);
				DPFPRINTF(("fragcache[%d]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    h->ip_id, frp->fr_off, frp->fr_end, off,
				    fr_max, fra->fr_off, fra->fr_end));
				fra->fr_off = frp->fr_off;
				LIST_REMOVE(frp, fr_next);
				pool_put(&pf_cent_pl, frp);
				frp = NULL;
				pf_ncache--;
			}
		}
	}

	if (hosed) {
		/*
		 * We must keep tracking the overall fragment even when
		 * we're going to drop it anyway so that we know when to
		 * free the overall descriptor.  Thus we drop the frag late.
		 */
		goto drop_fragment;
	}

pass:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max) {
		(*frag)->fr_max = fr_max;
	}

	/* This is the last segment */
	if (!mff) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}

	/* Check if we are completely reassembled */
	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
		/* Remove from fragment queue */
		DPFPRINTF(("fragcache[%d]: done 0-%d\n", h->ip_id,
		    (*frag)->fr_max));
		pf_free_fragment(*frag);
		*frag = NULL;
	}

	return m;

no_mem:
	*nomem = 1;

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}
	m_freem(m);
	return NULL;

drop_fragment:

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}

	if (drop) {
		/* This fragment has been deemed bad.  Don't reass */
		if (((*frag)->fr_flags & PFFRAG_DROP) == 0) {
			DPFPRINTF(("fragcache[%d]: dropping overall fragment\n",
			    h->ip_id));
			(*frag)->fr_flags |= PFFRAG_DROP;
		}
	}

	m_freem(m);
	return NULL;
}
#define FR_IP6_OFF(fr) \
	(ntohs((fr)->fr_ip6f_opt.ip6f_offlg & IP6F_OFF_MASK))
#define FR_IP6_PLEN(fr)	(ntohs((fr)->fr_ip6->ip6_plen))
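
/*
 * Note that FR_IP6_OFF() needs no '<< 3', unlike FR_IP_OFF() above: the
 * IPv6 fragment offset occupies the top 13 bits of ip6f_offlg, so masking
 * with IP6F_OFF_MASK and converting to host order already yields the
 * offset in bytes (the 8-octet unit count pre-shifted left by 3).
 */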
static struct mbuf *
pf_reassemble6(struct mbuf **m0, struct pf_fragment **frag,
    struct pf_frent *frent, int mff)
{
	struct mbuf *m, *m2;
	struct pf_frent *frea, *frep, *next;
	struct ip6_hdr *ip6;
	struct ip6_frag *ip6f;
	int plen, off, fr_max, pktlen;
	uint32_t uoff, csum, csum_flags;

	VERIFY(*frag == NULL || BUFFER_FRAGMENTS(*frag));
	m = *m0;
	frep = NULL;
	ip6 = frent->fr_ip6;
	ip6f = &frent->fr_ip6f_opt;
	off = FR_IP6_OFF(frent);
	uoff = frent->fr_ip6f_hlen;
	plen = FR_IP6_PLEN(frent);
	fr_max = off + plen - (frent->fr_ip6f_hlen - sizeof(*ip6));
	pktlen = plen + sizeof(*ip6);

	DPFPRINTF(("0x%llx IPv6 frag plen %u off %u fr_ip6f_hlen %u "
	    "fr_max %u m_len %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off,
	    frent->fr_ip6f_hlen, fr_max, m->m_len));

	/*
	 * Leverage partial checksum offload for simple UDP/IP fragments,
	 * as that is the most common case.
	 *
	 * Perform 1's complement adjustment of octets that got included/
	 * excluded in the hardware-calculated checksum value.  Also take
	 * care of any trailing bytes and subtract out their partial sum.
	 */
	if (ip6f->ip6f_nxt == IPPROTO_UDP &&
	    uoff == (sizeof(*ip6) + sizeof(*ip6f)) &&
	    (m->m_pkthdr.csum_flags &
	    (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
	    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
		uint32_t start = m->m_pkthdr.csum_rx_start;
		uint32_t ip_len = (sizeof(*ip6) + ntohs(ip6->ip6_plen));
		int32_t trailer = (m_pktlen(m) - ip_len);
		uint32_t swbytes = (uint32_t)trailer;

		csum = m->m_pkthdr.csum_rx_val;

		ASSERT(trailer >= 0);
		if (start != uoff || trailer != 0) {
			uint16_t s = 0, d = 0;

			if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
				s = ip6->ip6_src.s6_addr16[1];
				ip6->ip6_src.s6_addr16[1] = 0;
			}
			if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
				d = ip6->ip6_dst.s6_addr16[1];
				ip6->ip6_dst.s6_addr16[1] = 0;
			}

			/* callee folds in sum */
			csum = m_adj_sum16(m, start, uoff,
			    (ip_len - uoff), csum);
			if (uoff > start) {
				swbytes += (uoff - start);
			} else {
				swbytes += (start - uoff);
			}

			if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
				ip6->ip6_src.s6_addr16[1] = s;
			}
			if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
				ip6->ip6_dst.s6_addr16[1] = d;
			}
		}
		csum_flags = m->m_pkthdr.csum_flags;

		if (swbytes != 0) {
			udp_in6_cksum_stats(swbytes);
		}
		if (trailer != 0) {
			m_adj(m, -trailer);
		}
	} else {
		csum = 0;
		csum_flags = 0;
	}

	/* Invalidate checksum */
	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;

	/* strip off headers up to the fragment payload */
	m->m_data += frent->fr_ip6f_hlen;
	m->m_len -= frent->fr_ip6f_hlen;

	/* Create a new reassembly queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_frag_pl, PR_NOWAIT);
			if (*frag == NULL) {
				goto drop_fragment;
			}
		}

		(*frag)->fr_flags = 0;
		(*frag)->fr_max = 0;
		(*frag)->fr_ip6_maxlen = pktlen;
		(*frag)->fr_af = AF_INET6;
		(*frag)->fr_srcx.v6addr = frent->fr_ip6->ip6_src;
		(*frag)->fr_dstx.v6addr = frent->fr_ip6->ip6_dst;
		(*frag)->fr_p = frent->fr_ip6f_opt.ip6f_nxt;
		(*frag)->fr_id6 = frent->fr_ip6f_opt.ip6f_ident;
		(*frag)->fr_timeout = pf_time_second();
		if (csum_flags != 0) {
			(*frag)->fr_csum_flags = csum_flags;
			(*frag)->fr_csum = csum;
		}
		LIST_INIT(&(*frag)->fr_queue);

		RB_INSERT(pf_frag_tree, &pf_frag_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_fragqueue, *frag, frag_next);

		/* We do not have a previous fragment */
		frep = NULL;
		goto insert;
	}

	/* Remember maximum fragment len for refragmentation */
	if (pktlen > (*frag)->fr_ip6_maxlen) {
		(*frag)->fr_ip6_maxlen = pktlen;
	}
	/*
	 * If this fragment contains similar checksum offload info
	 * as that of the existing ones, accumulate checksum.  Otherwise,
	 * invalidate checksum offload info for the entire datagram.
	 */
	if (csum_flags != 0 && csum_flags == (*frag)->fr_csum_flags) {
		(*frag)->fr_csum += csum;
	} else if ((*frag)->fr_csum_flags != 0) {
		(*frag)->fr_csum_flags = 0;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	LIST_FOREACH(frea, &(*frag)->fr_queue, fr_next) {
		if (FR_IP6_OFF(frea) > off) {
			break;
		}
		frep = frea;
	}

	VERIFY(frep != NULL || frea != NULL);

	if (frep != NULL &&
	    FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) - frep->fr_ip6f_hlen > off) {
		u_int16_t precut;

		precut = FR_IP6_OFF(frep) + FR_IP6_PLEN(frep) -
		    frep->fr_ip6f_hlen - off;
		if (precut >= plen) {
			goto drop_fragment;
		}
		m_adj(frent->fr_m, precut);
		DPFPRINTF(("overlap -%d\n", precut));
		/* Enforce 8 byte boundaries */
		frent->fr_ip6f_opt.ip6f_offlg =
		    htons(ntohs(frent->fr_ip6f_opt.ip6f_offlg) +
		    (precut >> 3));
		off = FR_IP6_OFF(frent);
		plen -= precut;
		ip6->ip6_plen = htons(plen);
	}

	for (; frea != NULL && plen + off > FR_IP6_OFF(frea); frea = next) {
		u_int16_t aftercut;

		aftercut = plen + off - FR_IP6_OFF(frea);
		DPFPRINTF(("adjust overlap %d\n", aftercut));
		if (aftercut < FR_IP6_PLEN(frea) - frea->fr_ip6f_hlen) {
			frea->fr_ip6->ip6_plen = htons(FR_IP6_PLEN(frea) -
			    aftercut);
			frea->fr_ip6f_opt.ip6f_offlg =
			    htons(ntohs(frea->fr_ip6f_opt.ip6f_offlg) +
			    (aftercut >> 3));
			m_adj(frea->fr_m, aftercut);
			break;
		}

		/* This fragment is completely overlapped, lose it */
		next = LIST_NEXT(frea, fr_next);
		m_freem(frea->fr_m);
		LIST_REMOVE(frea, fr_next);
		pool_put(&pf_frent_pl, frea);
		pf_nfrents--;
	}

insert:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max) {
		(*frag)->fr_max = fr_max;
	}
	/* This is the last segment */
	if (!mff) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}

	if (frep == NULL) {
		LIST_INSERT_HEAD(&(*frag)->fr_queue, frent, fr_next);
	} else {
		LIST_INSERT_AFTER(frep, frent, fr_next);
	}

	/* Check if we are completely reassembled */
	if (!((*frag)->fr_flags & PFFRAG_SEENLAST)) {
		return NULL;
	}

	/* Check if we have all the data */
	off = 0;
	for (frep = LIST_FIRST(&(*frag)->fr_queue); frep; frep = next) {
		next = LIST_NEXT(frep, fr_next);
		off += FR_IP6_PLEN(frep) - (frent->fr_ip6f_hlen - sizeof *ip6);
		DPFPRINTF(("frep at %d, next %d, max %d\n",
		    off, next == NULL ? -1 : FR_IP6_OFF(next),
		    (*frag)->fr_max));
		if (off < (*frag)->fr_max &&
		    (next == NULL || FR_IP6_OFF(next) != off)) {
			DPFPRINTF(("missing fragment at %d, next %d, max %d\n",
			    off, next == NULL ? -1 : FR_IP6_OFF(next),
			    (*frag)->fr_max));
			return NULL;
		}
	}
	DPFPRINTF(("%d < %d?\n", off, (*frag)->fr_max));
	if (off < (*frag)->fr_max) {
		return NULL;
	}

	/* We have all the data */
	frent = LIST_FIRST(&(*frag)->fr_queue);
	VERIFY(frent != NULL);
	if (frent->fr_ip6f_hlen + off > IP_MAXPACKET) {
		DPFPRINTF(("drop: too big: %d\n", off));
		pf_free_fragment(*frag);
		*frag = NULL;
		return NULL;
	}

	ASSERT(*frag != NULL);
	ASSERT(frent != NULL);
	next = LIST_NEXT(frent, fr_next);
	if (next == NULL) {
		DPFPRINTF(("drop: atomic fragment\n"));
		pf_free_fragment(*frag);
		*frag = NULL;
		return NULL;
	}

	/* retrieve the values to be filled in to reassembled tag */
	uint16_t hdrlen, unfragpartlen, extoff, maxlen;
	uint32_t id;

	/* Get total extension header length from the first fragment */
	hdrlen = frent->fr_ip6f_hlen - sizeof(struct ip6_frag);
	/*
	 * Get total extension header length of per-fragment headers from the
	 * subsequent fragment.
	 */
	unfragpartlen = next->fr_ip6f_hlen - sizeof(struct ip6_frag);
	extoff = frent->fr_ip6f_extoff;
	maxlen = (*frag)->fr_ip6_maxlen;
	id = (*frag)->fr_id6;

	ip6 = frent->fr_ip6;
	ip6->ip6_nxt = (*frag)->fr_p;
	ip6->ip6_plen = htons(off);
	ip6->ip6_src = (*frag)->fr_srcx.v6addr;
	ip6->ip6_dst = (*frag)->fr_dstx.v6addr;

	if ((*frag)->fr_csum_flags != 0) {
		csum = (*frag)->fr_csum;

		ADDCARRY(csum);

		m->m_pkthdr.csum_rx_val = csum;
		m->m_pkthdr.csum_rx_start = sizeof(struct ip6_hdr);
		m->m_pkthdr.csum_flags = (*frag)->fr_csum_flags;
	} else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) ||
	    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
		/* loopback checksums are always OK */
		m->m_pkthdr.csum_data = 0xffff;
		m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	/* Remove from fragment queue */
	pf_remove_fragment(*frag);
	*frag = NULL;

	m = frent->fr_m;
	m->m_len += sizeof(struct ip6_hdr);
	m->m_data -= sizeof(struct ip6_hdr);
	memmove(m->m_data, ip6, sizeof(struct ip6_hdr));

	next = LIST_NEXT(frent, fr_next);
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	for (frent = next; next != NULL; frent = next) {
		m2 = frent->fr_m;

		m_cat(m, m2);
		next = LIST_NEXT(frent, fr_next);
		pool_put(&pf_frent_pl, frent);
		pf_nfrents--;
	}

	/* XXX this should be done elsewhere */
	if (m->m_flags & M_PKTHDR) {
		int len = 0;
		for (m2 = m; m2; m2 = m2->m_next) {
			len += m2->m_len;
		}
		m->m_pkthdr.len = len;
	}

	DPFPRINTF(("complete: 0x%llx ip6_plen %d m_pkthdr.len %d\n",
	    (uint64_t)VM_KERNEL_ADDRPERM(m), ntohs(ip6->ip6_plen),
	    m->m_pkthdr.len));

	/* Add the reassembled tag */
	struct m_tag *mtag;
	struct pf_fragment_tag *ftag;
	mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_PF_REASS,
	    sizeof(*ftag), M_NOWAIT, m);
	if (mtag == NULL) {
		/* XXX: add stats */
		m_freem(m);
		return NULL;
	}
	ftag = (struct pf_fragment_tag *)(mtag + 1);
	ftag->ft_hdrlen = hdrlen;
	ftag->ft_unfragpartlen = unfragpartlen;
	ftag->ft_extoff = extoff;
	ftag->ft_maxlen = maxlen;
	ftag->ft_id = id;
	m_tag_prepend(m, mtag);

	struct pf_mtag *pftag = pf_get_mtag(m);
	ASSERT(pftag != NULL);
	pftag->pftag_flags |= PF_TAG_REASSEMBLED;
	return m;

drop_fragment:
	/* Oops - fail safe - drop packet */
	pool_put(&pf_frent_pl, frent);
	pf_nfrents--;
	m_freem(m);
	return NULL;
}
static struct mbuf *
pf_frag6cache(struct mbuf **m0, struct ip6_hdr *h, struct ip6_frag *fh,
    struct pf_fragment **frag, int hlen, int mff, int drop, int *nomem)
{
	struct mbuf *m = *m0;
	u_int16_t plen, off, fr_max;
	struct pf_frcache *frp, *fra, *cur = NULL;
	int hosed = 0;

	VERIFY(*frag == NULL || !BUFFER_FRAGMENTS(*frag));

	off = ntohs(fh->ip6f_offlg & IP6F_OFF_MASK);
	plen = ntohs(h->ip6_plen) - (hlen - sizeof *h);

	/*
	 * Apple Modification: dimambro@apple.com. The hlen, being passed
	 * into this function, includes all the headers associated with
	 * the packet, and may include routing headers, so to get to
	 * the data payload as stored in the original IPv6 header we need
	 * to subtract all those headers and the IP header.
	 *
	 * The 'fr_max' local variable should also contain the offset from
	 * the start of the reassembled packet to the octet just past the
	 * end of the octets in the current fragment where:
	 *  - 'off' is the offset from the start of the reassembled packet to
	 *    the first octet in the fragment,
	 *  - 'plen' is the length of the "payload data length" excluding all
	 *    the IPv6 headers of the fragment.
	 *  - 'hlen' is computed in pf_normalize_ip6() as the offset from the
	 *    start of the IPv6 packet to the beginning of the data.
	 */
	fr_max = off + plen;

	DPFPRINTF(("0x%llx plen %u off %u fr_max %u\n",
	    (uint64_t)VM_KERNEL_ADDRPERM(m), plen, off, fr_max));

	/* Create a new range queue for this packet */
	if (*frag == NULL) {
		*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
		if (*frag == NULL) {
			pf_flush_fragments();
			*frag = pool_get(&pf_cache_pl, PR_NOWAIT);
			if (*frag == NULL) {
				goto no_mem;
			}
		}

		/* Get an entry for the queue */
		cur = pool_get(&pf_cent_pl, PR_NOWAIT);
		if (cur == NULL) {
			pool_put(&pf_cache_pl, *frag);
			*frag = NULL;
			goto no_mem;
		}
		pf_ncache++;

		(*frag)->fr_flags = PFFRAG_NOBUFFER;
		(*frag)->fr_max = 0;
		(*frag)->fr_af = AF_INET6;
		(*frag)->fr_srcx.v6addr = h->ip6_src;
		(*frag)->fr_dstx.v6addr = h->ip6_dst;
		(*frag)->fr_p = fh->ip6f_nxt;
		(*frag)->fr_id6 = fh->ip6f_ident;
		(*frag)->fr_timeout = pf_time_second();

		cur->fr_off = off;
		cur->fr_end = fr_max;
		LIST_INIT(&(*frag)->fr_cache);
		LIST_INSERT_HEAD(&(*frag)->fr_cache, cur, fr_next);

		RB_INSERT(pf_frag_tree, &pf_cache_tree, *frag);
		TAILQ_INSERT_HEAD(&pf_cachequeue, *frag, frag_next);

		DPFPRINTF(("frag6cache[%d]: new %d-%d\n", ntohl(fh->ip6f_ident),
		    off, fr_max));

		goto pass;
	}

	/*
	 * Find a fragment after the current one:
	 *  - off contains the real shifted offset.
	 */
	frp = NULL;
	LIST_FOREACH(fra, &(*frag)->fr_cache, fr_next) {
		if (fra->fr_off > off) {
			break;
		}
		frp = fra;
	}

	VERIFY(frp != NULL || fra != NULL);

	if (frp != NULL) {
		int precut;

		precut = frp->fr_end - off;
		if (precut >= plen) {
			/* Fragment is entirely a duplicate */
			DPFPRINTF(("frag6cache[%u]: dead (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end,
			    off, fr_max));
			goto drop_fragment;
		}
		if (precut == 0) {
			/* They are adjacent.  Fixup cache entry */
			DPFPRINTF(("frag6cache[%u]: adjacent (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), frp->fr_off, frp->fr_end,
			    off, fr_max));
			frp->fr_end = fr_max;
		} else if (precut > 0) {
			/* The first part of this payload overlaps with a
			 * fragment that has already been passed.
			 * Need to trim off the first part of the payload.
			 * But to do so easily, we need to create another
			 * mbuf to throw the original header into.
			 */

			DPFPRINTF(("frag6cache[%u]: chop %d (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), precut, frp->fr_off,
			    frp->fr_end, off, fr_max));

			off += precut;
			fr_max -= precut;
			/* Update the previous frag to encompass this one */
			frp->fr_end = fr_max;

			if (!drop) {
				/* XXX Optimization opportunity
				 * This is a very heavy way to trim the payload.
				 * we could do it much faster by diddling mbuf
				 * internals but that would be even less legible
				 * than this mbuf magic.  For my next trick,
				 * I'll pull a rabbit out of my laptop.
				 */
				*m0 = m_copym(m, 0, hlen, M_NOWAIT);
				if (*m0 == NULL) {
					goto no_mem;
				}
				VERIFY((*m0)->m_next == NULL);
				m_adj(m, precut + hlen);
				m_cat(*m0, m);
				m = *m0;
				if (m->m_flags & M_PKTHDR) {
					int pktlen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next) {
						pktlen += t->m_len;
					}
					m->m_pkthdr.len = pktlen;
				}

				h = mtod(m, struct ip6_hdr *);

				VERIFY((int)m->m_len ==
				    ntohs(h->ip6_plen) - precut);
				fh->ip6f_offlg &= ~IP6F_OFF_MASK;
				fh->ip6f_offlg |=
				    htons(ntohs(fh->ip6f_offlg & IP6F_OFF_MASK)
				    + (precut >> 3));
				h->ip6_plen = htons(ntohs(h->ip6_plen) -
				    precut);
			} else {
				hosed++;
			}
		} else {
			/* There is a gap between fragments */

			DPFPRINTF(("frag6cache[%u]: gap %d (%d-%d) %d-%d\n",
			    ntohl(fh->ip6f_ident), -precut, frp->fr_off,
			    frp->fr_end, off, fr_max));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL) {
				goto no_mem;
			}
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_AFTER(frp, cur, fr_next);
		}
	}

	if (fra != NULL) {
		int aftercut;
		int merge = 0;

		aftercut = fr_max - fra->fr_off;
		if (aftercut == 0) {
			/* Adjacent fragments */
			DPFPRINTF(("frag6cache[%u]: adjacent %d-%d (%d-%d)\n",
			    ntohl(fh->ip6f_ident), off, fr_max, fra->fr_off,
			    fra->fr_end));
			fra->fr_off = off;
			merge = 1;
		} else if (aftercut > 0) {
			/* Need to chop off the tail of this fragment */
			DPFPRINTF(("frag6cache[%u]: chop %d %d-%d (%d-%d)\n",
			    ntohl(fh->ip6f_ident), aftercut, off, fr_max,
			    fra->fr_off, fra->fr_end));
			fra->fr_off = off;
			fr_max -= aftercut;

			merge = 1;

			if (!drop) {
				m_adj(m, -aftercut);
				if (m->m_flags & M_PKTHDR) {
					int pktlen = 0;
					struct mbuf *t;
					for (t = m; t; t = t->m_next) {
						pktlen += t->m_len;
					}
					m->m_pkthdr.len = pktlen;
				}
				h = mtod(m, struct ip6_hdr *);
				VERIFY((int)m->m_len ==
				    ntohs(h->ip6_plen) - aftercut);
				h->ip6_plen =
				    htons(ntohs(h->ip6_plen) - aftercut);
			} else {
				hosed++;
			}
		} else if (frp == NULL) {
			/* There is a gap between fragments */
			DPFPRINTF(("frag6cache[%u]: gap %d %d-%d (%d-%d)\n",
			    ntohl(fh->ip6f_ident), -aftercut, off, fr_max,
			    fra->fr_off, fra->fr_end));

			cur = pool_get(&pf_cent_pl, PR_NOWAIT);
			if (cur == NULL) {
				goto no_mem;
			}
			pf_ncache++;

			cur->fr_off = off;
			cur->fr_end = fr_max;
			LIST_INSERT_BEFORE(fra, cur, fr_next);
		}

		/* Need to glue together two separate fragment descriptors */
		if (merge) {
			if (cur && fra->fr_off <= cur->fr_end) {
				/* Need to merge in a previous 'cur' */
				DPFPRINTF(("frag6cache[%u]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    ntohl(fh->ip6f_ident), cur->fr_off,
				    cur->fr_end, off, fr_max, fra->fr_off,
				    fra->fr_end));
				fra->fr_off = cur->fr_off;
				LIST_REMOVE(cur, fr_next);
				pool_put(&pf_cent_pl, cur);
				cur = NULL;
				pf_ncache--;
			} else if (frp && fra->fr_off <= frp->fr_end) {
				/* Need to merge in a modified 'frp' */
				VERIFY(cur == NULL);
				DPFPRINTF(("frag6cache[%u]: adjacent(merge "
				    "%d-%d) %d-%d (%d-%d)\n",
				    ntohl(fh->ip6f_ident), frp->fr_off,
				    frp->fr_end, off, fr_max, fra->fr_off,
				    fra->fr_end));
				fra->fr_off = frp->fr_off;
				LIST_REMOVE(frp, fr_next);
				pool_put(&pf_cent_pl, frp);
				frp = NULL;
				pf_ncache--;
			}
		}
	}

	if (hosed) {
		/*
		 * We must keep tracking the overall fragment even when
		 * we're going to drop it anyway so that we know when to
		 * free the overall descriptor.  Thus we drop the frag late.
		 */
		goto drop_fragment;
	}

pass:
	/* Update maximum data size */
	if ((*frag)->fr_max < fr_max) {
		(*frag)->fr_max = fr_max;
	}

	/* This is the last segment */
	if (!mff) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}

	/* Check if we are completely reassembled */
	if (((*frag)->fr_flags & PFFRAG_SEENLAST) &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_off == 0 &&
	    LIST_FIRST(&(*frag)->fr_cache)->fr_end == (*frag)->fr_max) {
		/* Remove from fragment queue */
		DPFPRINTF(("frag6cache[%u]: done 0-%d\n",
		    ntohl(fh->ip6f_ident), (*frag)->fr_max));
		pf_free_fragment(*frag);
		*frag = NULL;
	}

	return m;

no_mem:
	*nomem = 1;

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}
	m_freem(m);
	return NULL;

drop_fragment:

	/* Still need to pay attention to !IP_MF */
	if (!mff && *frag != NULL) {
		(*frag)->fr_flags |= PFFRAG_SEENLAST;
	}

	if (drop) {
		/* This fragment has been deemed bad.  Don't reass */
		if (((*frag)->fr_flags & PFFRAG_DROP) == 0) {
			DPFPRINTF(("frag6cache[%u]: dropping overall fragment\n",
			    ntohl(fh->ip6f_ident)));
			(*frag)->fr_flags |= PFFRAG_DROP;
		}
	}

	m_freem(m);
	return NULL;
}
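
/*
 * pf_refragment6() undoes a reassembly performed on this hop: it pulls
 * the parameters that pf_reassemble6() stashed in the PF_REASS m_tag
 * (extension header layout, largest original fragment size, fragment id)
 * and hands the packet to ip6_do_fragmentation() so it leaves the box
 * re-split, with the MTU rounded down to a multiple of 8.
 */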
int
pf_refragment6(struct ifnet *ifp, pbuf_t **pbufp, struct pf_fragment_tag *ftag)
{
	struct mbuf *m;
	uint32_t frag_id;
	uint16_t hdrlen, extoff, maxlen, unfragpartlen;
	uint8_t proto;
	int error, action;
	uint8_t *lexthdrsp;
	struct route_in6 ip6route;
	struct route_in6 *ro;
	struct sockaddr_in6 *dst;
	struct ip6_hdr *hdr;
	struct pf_mtag *mtag;
	struct m_tag *tag;

	if (pbufp == NULL || !pbuf_is_valid(*pbufp) || ftag == NULL) {
		panic("pf_route6: invalid parameters");
		/* NOTREACHED */
	}
	m = pbuf_to_mbuf(*pbufp, FALSE);
	hdr = mtod(m, struct ip6_hdr *);
	mtag = pf_find_mtag(m);
	hdrlen = ftag->ft_hdrlen - sizeof(struct ip6_hdr);
	extoff = ftag->ft_extoff;
	maxlen = ftag->ft_maxlen;
	frag_id = ftag->ft_id;
	unfragpartlen = ftag->ft_unfragpartlen;
	tag = (struct m_tag *)(void *)ftag;
	tag = tag - 1;
	m_tag_delete(m, tag);
	ftag = NULL; /* hereafter unusable */
	mtag->pftag_flags &= ~PF_TAG_REASSEMBLED;
	ro = &ip6route;
	bzero((caddr_t)ro, sizeof(*ro));
	dst = (struct sockaddr_in6 *)&ro->ro_dst;
	dst->sin6_family = AF_INET6;
	dst->sin6_len = sizeof(*dst);
	dst->sin6_addr = hdr->ip6_dst;

	if (extoff) {
		int off;
		struct mbuf *mexthdr;

		/* Use protocol from next field of last extension header */
		mexthdr = m_getptr(m, extoff +
		    offsetof(struct ip6_ext, ip6e_nxt), &off);
		ASSERT(mexthdr != NULL);
		lexthdrsp = (mtod(mexthdr, uint8_t *) + off);
		proto = *lexthdrsp;
		if (proto == IPPROTO_DSTOPTS) {
			struct ip6_ext ext;
			if (!pf_pull_hdr(*pbufp, off, &ext, sizeof(ext), NULL,
			    NULL, AF_INET6)) {
				DPFPRINTF(("pkt too short"));
				action = PF_DROP;
				goto done;
			}
			proto = ext.ip6e_nxt;
		}
	} else {
		lexthdrsp = NULL;
		proto = hdr->ip6_nxt;
	}

	/*
	 * The MTU must be a multiple of 8 bytes, or we risk doing the
	 * fragmentation wrong.
	 */
	maxlen = maxlen & ~7;

	error = ip6_do_fragmentation(&m, hdrlen, NULL, unfragpartlen,
	    hdr, lexthdrsp, maxlen, proto, frag_id);

	if (error == 0) {
		/*
		 * PF_TAG_REFRAGMENTED flag set to indicate ip6_forward()
		 * and pf_route6() that the mbuf contains a chain of fragments.
		 */
		mtag->pftag_flags |= PF_TAG_REFRAGMENTED;
		action = PF_PASS;
		pbuf_init_mbuf(*pbufp, m, ifp);
	} else {
		DPFPRINTF(("refragment error %d", error));
		action = PF_DROP;
		m = NULL;
	}

done:
	return action;
}
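
/*
 * pf_normalize_ip() is the IPv4 scrub entry point: it walks the scrub
 * ruleset for a matching rule, sanity-checks the header, and then either
 * fully reassembles fragments via pf_reassemble() (the default) or runs
 * them through the non-buffering cache via pf_fragcache() when the rule
 * requests 'fragment crop' or 'fragment drop-ovl'.
 */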
int
pf_normalize_ip(pbuf_t *pbuf, int dir, struct pfi_kif *kif, u_short *reason,
    struct pf_pdesc *pd)
{
	struct mbuf *m;
	struct pf_rule *r;
	struct pf_frent *frent;
	struct pf_fragment *frag = NULL;
	struct ip *h = pbuf->pb_data;
	int mff = (ntohs(h->ip_off) & IP_MF);
	int hlen = h->ip_hl << 2;
	u_int16_t fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
	u_int16_t fr_max;
	int ip_len;
	int ip_off;
	int asd = 0;
	struct pf_ruleset *ruleset = NULL;
	struct ifnet *ifp = pbuf->pb_ifp;

	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
	while (r != NULL) {
		r->evaluations++;
		if (pfi_kif_match(r->kif, kif) == r->ifnot) {
			r = r->skip[PF_SKIP_IFP].ptr;
		} else if (r->direction && r->direction != dir) {
			r = r->skip[PF_SKIP_DIR].ptr;
		} else if (r->af && r->af != AF_INET) {
			r = r->skip[PF_SKIP_AF].ptr;
		} else if (r->proto && r->proto != h->ip_p) {
			r = r->skip[PF_SKIP_PROTO].ptr;
		} else if (PF_MISMATCHAW(&r->src.addr,
		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
		    r->src.neg, kif)) {
			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
		} else if (PF_MISMATCHAW(&r->dst.addr,
		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
		    r->dst.neg, NULL)) {
			r = r->skip[PF_SKIP_DST_ADDR].ptr;
		} else {
			if (r->anchor == NULL) {
				break;
			} else {
				pf_step_into_anchor(&asd, &ruleset,
				    PF_RULESET_SCRUB, &r, NULL, NULL);
			}
		}
		if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
		    PF_RULESET_SCRUB, &r, NULL, NULL)) {
			break;
		}
	}

	if (r == NULL || r->action == PF_NOSCRUB) {
		return PF_PASS;
	} else {
		r->packets[dir == PF_OUT]++;
		r->bytes[dir == PF_OUT] += pd->tot_len;
	}

	/* Check for illegal packets */
	if (hlen < (int)sizeof(struct ip)) {
		goto drop;
	}

	if (hlen > ntohs(h->ip_len)) {
		goto drop;
	}

	/* Clear IP_DF if the rule uses the no-df option */
	if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
		u_int16_t ipoff = h->ip_off;

		h->ip_off &= htons(~IP_DF);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ipoff, h->ip_off, 0);
	}

	/* We will need other tests here */
	if (!fragoff && !mff) {
		goto no_fragment;
	}

	/*
	 * We're dealing with a fragment now. Don't allow fragments
	 * with IP_DF to enter the cache. If the flag was cleared by
	 * no-df above, fine. Otherwise drop it.
	 */
	if (h->ip_off & htons(IP_DF)) {
		DPFPRINTF(("IP_DF\n"));
		goto bad;
	}

	ip_len = ntohs(h->ip_len) - hlen;
	ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;

	/* All fragments are 8 byte aligned */
	if (mff && (ip_len & 0x7)) {
		DPFPRINTF(("mff and %d\n", ip_len));
		goto bad;
	}

	/* Respect maximum length */
	if (fragoff + ip_len > IP_MAXPACKET) {
		DPFPRINTF(("max packet %d\n", fragoff + ip_len));
		goto bad;
	}
	fr_max = fragoff + ip_len;

	if ((r->rule_flag & (PFRULE_FRAGCROP | PFRULE_FRAGDROP)) == 0) {
		/* Fully buffer all of the fragments */

		frag = pf_find_fragment_by_ipv4_header(h, &pf_frag_tree);
		/* Check if we saw the last fragment already */
		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
		    fr_max > frag->fr_max) {
			goto bad;
		}

		if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
			REASON_SET(reason, PFRES_MEMORY);
			return PF_DROP;
		}

		VERIFY(!pbuf_is_valid(pbuf));

		/* Restore iph pointer after pbuf_to_mbuf() */
		h = mtod(m, struct ip *);

		/* Get an entry for the fragment queue */
		frent = pool_get(&pf_frent_pl, PR_NOWAIT);
		if (frent == NULL) {
			REASON_SET(reason, PFRES_MEMORY);
			m_freem(m);
			return PF_DROP;
		}
		pf_nfrents++;
		frent->fr_ip = h;
		frent->fr_m = m;

		/* Might return a completely reassembled mbuf, or NULL */
		DPFPRINTF(("reass IPv4 frag %d @ %d-%d\n", ntohs(h->ip_id),
		    fragoff, fr_max));
		m = pf_reassemble(m, &frag, frent, mff);

		if (m == NULL) {
			return PF_DROP;
		}

		VERIFY(m->m_flags & M_PKTHDR);
		pbuf_init_mbuf(pbuf, m, ifp);

		/* use mtag from concatenated mbuf chain */
		pd->pf_mtag = pf_find_mtag_pbuf(pbuf);
#if 0
		// SCW: This check is superfluous
		if (pd->pf_mtag == NULL) {
			printf("%s: pf_find_mtag returned NULL(1)\n", __func__);
			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
				m_freem(m);
				m = NULL;
				goto no_mem;
			}
		}
#endif

		h = mtod(m, struct ip *);

		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) {
			goto drop;
		}
	} else {
		/* non-buffering fragment cache (drops or masks overlaps) */
		int nomem = 0;

		if (dir == PF_OUT && (pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) {
			/*
			 * Already passed the fragment cache in the
			 * input direction. If we continued, it would
			 * appear to be a dup and would be dropped.
			 */
			goto fragment_pass;
		}

		frag = pf_find_fragment_by_ipv4_header(h, &pf_cache_tree);

		/* Check if we saw the last fragment already */
		if (frag != NULL && (frag->fr_flags & PFFRAG_SEENLAST) &&
		    fr_max > frag->fr_max) {
			if (r->rule_flag & PFRULE_FRAGDROP) {
				frag->fr_flags |= PFFRAG_DROP;
			}
			goto bad;
		}

		if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
			REASON_SET(reason, PFRES_MEMORY);
			goto no_mem;
		}

		VERIFY(!pbuf_is_valid(pbuf));

		/* Restore iph pointer after pbuf_to_mbuf() */
		h = mtod(m, struct ip *);

		m = pf_fragcache(&m, h, &frag, mff,
		    (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
		if (m == NULL) {
			// Note: pf_fragcache() has already m_freem'd the mbuf
			if (nomem) {
				goto no_mem;
			}
			goto drop;
		}

		VERIFY(m->m_flags & M_PKTHDR);
		pbuf_init_mbuf(pbuf, m, ifp);

		/* use mtag from copied and trimmed mbuf chain */
		pd->pf_mtag = pf_find_mtag_pbuf(pbuf);
#if 0
		// SCW: This check is superfluous
		if (pd->pf_mtag == NULL) {
			printf("%s: pf_find_mtag returned NULL(2)\n", __func__);
			if ((pd->pf_mtag = pf_get_mtag(m)) == NULL) {
				m_freem(m);
				m = NULL;
				goto no_mem;
			}
		}
#endif
		if (dir == PF_IN) {
			pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE;
		}

		if (frag != NULL && (frag->fr_flags & PFFRAG_DROP)) {
			goto drop;
		}

		goto fragment_pass;
	}

no_fragment:
	/* At this point, only IP_DF is allowed in ip_off */
	if (h->ip_off & ~htons(IP_DF)) {
		u_int16_t ipoff = h->ip_off;

		h->ip_off &= htons(IP_DF);
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ipoff, h->ip_off, 0);
	}

	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip_ttl < r->min_ttl) {
		u_int16_t ip_ttl = h->ip_ttl;

		h->ip_ttl = r->min_ttl;
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
	}
	if (r->rule_flag & PFRULE_RANDOMID) {
		u_int16_t oip_id = h->ip_id;

		if (rfc6864 && IP_OFF_IS_ATOMIC(ntohs(h->ip_off))) {
			h->ip_id = 0;
		} else {
			h->ip_id = ip_randomid();
		}
		h->ip_sum = pf_cksum_fixup(h->ip_sum, oip_id, h->ip_id, 0);
	}
	if ((r->rule_flag & (PFRULE_FRAGCROP | PFRULE_FRAGDROP)) == 0) {
		pd->flags |= PFDESC_IP_REAS;
	}

	return PF_PASS;

fragment_pass:
	/* Enforce a minimum ttl, may cause endless packet loops */
	if (r->min_ttl && h->ip_ttl < r->min_ttl) {
		u_int16_t ip_ttl = h->ip_ttl;

		h->ip_ttl = r->min_ttl;
		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_ttl, h->ip_ttl, 0);
	}
	if ((r->rule_flag & (PFRULE_FRAGCROP | PFRULE_FRAGDROP)) == 0) {
		pd->flags |= PFDESC_IP_REAS;
	}
	return PF_PASS;

no_mem:
	REASON_SET(reason, PFRES_MEMORY);
	if (r != NULL && r->log && pbuf_is_valid(pbuf)) {
		PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, *reason, r,
		    NULL, NULL, pd);
	}
	return PF_DROP;

drop:
	REASON_SET(reason, PFRES_NORM);
	if (r != NULL && r->log && pbuf_is_valid(pbuf)) {
		PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, *reason, r,
		    NULL, NULL, pd);
	}
	return PF_DROP;

bad:
	DPFPRINTF(("dropping bad IPv4 fragment\n"));

	/* Free associated fragments */
	if (frag != NULL) {
		pf_free_fragment(frag);
	}

	REASON_SET(reason, PFRES_FRAG);
	if (r != NULL && r->log && pbuf_is_valid(pbuf)) {
		PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, *reason, r, NULL, NULL, pd);
	}

	return PF_DROP;
}
static __inline struct pf_fragment *
pf_find_fragment_by_ipv6_header(struct ip6_hdr *ip6, struct ip6_frag *fh,
    struct pf_frag_tree *tree)
{
	struct pf_fragment key;
	pf_ip6hdr2key(&key, ip6, fh);
	return pf_find_fragment_by_key(&key, tree);
}
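
/*
 * pf_normalize_ip6() mirrors pf_normalize_ip() for IPv6, but must first
 * walk the extension header chain to find a fragment header; along the
 * way it rejects multiple routing headers, type 0 routing headers and
 * malformed jumbo payload options.
 */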
2245 pf_normalize_ip6(pbuf_t
*pbuf
, int dir
, struct pfi_kif
*kif
,
2246 u_short
*reason
, struct pf_pdesc
*pd
)
2248 struct mbuf
*m
= NULL
;
2250 struct ip6_hdr
*h
= pbuf
->pb_data
;
2255 struct ip6_opt_jumbo jumbo
;
2258 struct ip6_frag frag
;
2259 u_int32_t jumbolen
= 0, plen
;
2260 u_int16_t fragoff
= 0;
2263 struct pf_frent
*frent
;
2264 struct pf_fragment
*pff
= NULL
;
2265 int mff
= 0, rh_cnt
= 0;
2268 struct pf_ruleset
*ruleset
= NULL
;
2269 struct ifnet
*ifp
= pbuf
->pb_ifp
;
2271 r
= TAILQ_FIRST(pf_main_ruleset
.rules
[PF_RULESET_SCRUB
].active
.ptr
);
2274 if (pfi_kif_match(r
->kif
, kif
) == r
->ifnot
) {
2275 r
= r
->skip
[PF_SKIP_IFP
].ptr
;
2276 } else if (r
->direction
&& r
->direction
!= dir
) {
2277 r
= r
->skip
[PF_SKIP_DIR
].ptr
;
2278 } else if (r
->af
&& r
->af
!= AF_INET6
) {
2279 r
= r
->skip
[PF_SKIP_AF
].ptr
;
2281 #if 0 /* header chain! */
2282 else if (r
->proto
&& r
->proto
!= h
->ip6_nxt
) {
2283 r
= r
->skip
[PF_SKIP_PROTO
].ptr
;
2286 else if (PF_MISMATCHAW(&r
->src
.addr
,
2287 (struct pf_addr
*)(uintptr_t)&h
->ip6_src
, AF_INET6
,
2289 r
= r
->skip
[PF_SKIP_SRC_ADDR
].ptr
;
2290 } else if (PF_MISMATCHAW(&r
->dst
.addr
,
2291 (struct pf_addr
*)(uintptr_t)&h
->ip6_dst
, AF_INET6
,
2292 r
->dst
.neg
, NULL
)) {
2293 r
= r
->skip
[PF_SKIP_DST_ADDR
].ptr
;
2295 if (r
->anchor
== NULL
) {
2298 pf_step_into_anchor(&asd
, &ruleset
,
2299 PF_RULESET_SCRUB
, &r
, NULL
, NULL
);
2302 if (r
== NULL
&& pf_step_out_of_anchor(&asd
, &ruleset
,
2303 PF_RULESET_SCRUB
, &r
, NULL
, NULL
)) {
2308 if (r
== NULL
|| r
->action
== PF_NOSCRUB
) {
2311 r
->packets
[dir
== PF_OUT
]++;
2312 r
->bytes
[dir
== PF_OUT
] += pd
->tot_len
;
2315 /* Check for illegal packets */
2316 if ((uint32_t)(sizeof(struct ip6_hdr
) + IPV6_MAXPACKET
) <
2317 pbuf
->pb_packet_len
) {
2322 off
= sizeof(struct ip6_hdr
);
2328 case IPPROTO_FRAGMENT
:
2331 case IPPROTO_ROUTING
:
2332 case IPPROTO_DSTOPTS
:
2333 if (!pf_pull_hdr(pbuf
, off
, &ext
, sizeof(ext
), NULL
,
2340 * Multiple routing headers not allowed.
2341 * Routing header type zero considered harmful.
2343 if (proto
== IPPROTO_ROUTING
) {
2344 const struct ip6_rthdr
*rh
=
2345 (const struct ip6_rthdr
*)&ext
;
2349 if (rh
->ip6r_type
== IPV6_RTHDR_TYPE_0
) {
2352 } else if (proto
== IPPROTO_AH
) {
2353 off
+= (ext
.ip6e_len
+ 2) * 4;
2355 off
+= (ext
.ip6e_len
+ 1) * 8;
2357 proto
= ext
.ip6e_nxt
;
2359 case IPPROTO_HOPOPTS:
2360 if (!pf_pull_hdr(pbuf, off, &ext, sizeof(ext), NULL,
2365 optend = off + (ext.ip6e_len + 1) * 8;
2366 ooff = off + sizeof(ext);
2368 if (!pf_pull_hdr(pbuf, ooff, &opt.ip6o_type,
2369 sizeof(opt.ip6o_type), NULL, NULL,
2373 if (opt.ip6o_type == IP6OPT_PAD1) {
2377 if (!pf_pull_hdr(pbuf, ooff, &opt, sizeof(opt),
2378 NULL, NULL, AF_INET6)) {
2381 if ((ooff + (int) sizeof(opt) + opt.ip6o_len) >
2385 switch (opt.ip6o_type) {
2387 if (h->ip6_plen != 0) {
2390 if (!pf_pull_hdr(pbuf, ooff, &jumbo,
2391 sizeof(jumbo), NULL, NULL,
2395 memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
2397 jumbolen = ntohl(jumbolen);
2398 if (jumbolen <= IPV6_MAXPACKET) {
2401 if ((sizeof(struct ip6_hdr) +
2402 jumbolen) != pbuf->pb_packet_len) {
2409 ooff += sizeof(opt) + opt.ip6o_len;
2410 } while (ooff < optend);
2413 proto = ext.ip6e_nxt;
2419 } while (!terminal);
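/*
 * Illustrative note (editorial, not in the source): a valid jumbogram
 * (RFC 2675) must satisfy three conditions that the Hop-by-Hop walk
 * above enforces: the fixed header's ip6_plen must be zero, the jumbo
 * length carried in the option must exceed IPV6_MAXPACKET (65535),
 * and sizeof(struct ip6_hdr) + jumbolen must equal the bytes actually
 * received. E.g. a 70000-byte payload is announced as ip6_plen == 0
 * plus a jumbo option of 70000, and the packet must be exactly
 * 40 + 70000 bytes long on the wire.
 */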
2421 /* jumbo payload option must be present, or plen > 0 */
2422 if (ntohs(h->ip6_plen) == 0) {
2425 plen = ntohs(h->ip6_plen);
2430 if ((uint32_t)(sizeof(struct ip6_hdr) + plen) > pbuf->pb_packet_len) {
2434 /* Enforce a minimum ttl, may cause endless packet loops */
2435 if (r->min_ttl && h->ip6_hlim < r->min_ttl) {
2436 h->ip6_hlim = r->min_ttl;
2442 plen = ntohs(h->ip6_plen);
2443 /* Jumbo payload packets cannot be fragmented */
2444 if (plen == 0 || jumbolen) {
2448 if (!pf_pull_hdr(pbuf, off, &frag, sizeof(frag), NULL, NULL, AF_INET6)) {
2451 fragoff = ntohs(frag.ip6f_offlg & IP6F_OFF_MASK);
2452 pd->proto = frag.ip6f_nxt;
2453 mff = ntohs(frag.ip6f_offlg & IP6F_MORE_FRAG);
2454 off += sizeof(frag);
2455 if (fragoff + (plen - off) > IPV6_MAXPACKET) {
2459 fr_max = fragoff + plen - (off - sizeof(struct ip6_hdr));
2460 // XXX SCW: mbuf-specific
2461 // DPFPRINTF(("0x%llx IPv6 frag plen %u mff %d off %u fragoff %u "
2462 // "fr_max %u\n", (uint64_t)VM_KERNEL_ADDRPERM(m), plen, mff, off,
2463 // fragoff, fr_max));
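/*
 * Illustrative worked example (editorial, not in the source): masking
 * ip6f_offlg with IP6F_OFF_MASK keeps the 13-bit offset field in its
 * high bits, so after ntohs() the value is already the fragment
 * offset in bytes (the field counts 8-byte units, and leaving the
 * three low flag bits as zeros supplies the *8 shift). For a second
 * fragment at offset 1224 with plen == 1240 and off == 48 (40-byte
 * fixed header plus 8-byte fragment header):
 *
 *	fr_max = 1224 + 1240 - (48 - 40) = 2456
 *
 * i.e. the highest payload byte this fragment would occupy in the
 * reassembled packet.
 */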
2465 if ((r->rule_flag & (PFRULE_FRAGCROP | PFRULE_FRAGDROP)) == 0) {
2466 /* Fully buffer all of the fragments */
2467 pd->flags |= PFDESC_IP_REAS;
2469 pff = pf_find_fragment_by_ipv6_header(h, &frag,
2472 /* Check if we saw the last fragment already */
2473 if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) &&
2474 fr_max > pff->fr_max) {
2478 if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
2479 REASON_SET(reason, PFRES_MEMORY);
2483 /* Restore iph pointer after pbuf_to_mbuf() */
2484 h = mtod(m, struct ip6_hdr *);
2486 /* Get an entry for the fragment queue */
2487 frent = pool_get(&pf_frent_pl, PR_NOWAIT);
2488 if (frent == NULL) {
2489 REASON_SET(reason, PFRES_MEMORY);
2496 frent->fr_ip6f_opt = frag;
2497 frent->fr_ip6f_extoff = extoff;
2498 frent->fr_ip6f_hlen = off;
2499 /* account for 2nd Destination Options header if present */
2500 if (pd->proto == IPPROTO_DSTOPTS) {
2501 if (!pf_pull_hdr(pbuf, off, &ext, sizeof(ext), NULL,
2505 frent->fr_ip6f_hlen += (ext.ip6e_len + 1) * 8;
2508 /* Might return a completely reassembled mbuf, or NULL */
2509 DPFPRINTF(("reass IPv6 frag %d @ %d-%d\n",
2510 ntohl(frag.ip6f_ident), fragoff, fr_max));
2511 m = pf_reassemble6(&m, &pff, frent, mff);
2517 pbuf_init_mbuf(pbuf, m, ifp);
2520 if (pff != NULL && (pff->fr_flags & PFFRAG_DROP)) {
2523 } else if (dir == PF_IN ||
2524 !(pd->pf_mtag->pftag_flags & PF_TAG_FRAGCACHE)) {
2525 /* non-buffering fragment cache (overlaps: see RFC 5722) */
2528 pff = pf_find_fragment_by_ipv6_header(h, &frag,
2531 /* Check if we saw the last fragment already */
2532 if (pff != NULL && (pff->fr_flags & PFFRAG_SEENLAST) &&
2533 fr_max > pff->fr_max) {
2534 if (r->rule_flag & PFRULE_FRAGDROP) {
2535 pff->fr_flags |= PFFRAG_DROP;
2540 if ((m = pbuf_to_mbuf(pbuf, TRUE)) == NULL) {
2544 /* Restore iph pointer after pbuf_to_mbuf() */
2545 h = mtod(m, struct ip6_hdr *);
2547 m = pf_frag6cache(&m, h, &frag, &pff, off, mff,
2548 (r->rule_flag & PFRULE_FRAGDROP) ? 1 : 0, &nomem);
2550 // Note: pf_frag6cache() has already m_freem'd the mbuf
2557 pbuf_init_mbuf(pbuf, m, ifp);
2558 pd->pf_mtag = pf_find_mtag_pbuf(pbuf);
2562 pd->pf_mtag->pftag_flags |= PF_TAG_FRAGCACHE;
2565 if (pff != NULL && (pff->fr_flags & PFFRAG_DROP)) {
2570 /* Enforce a minimum ttl, may cause endless packet loops */
2571 if (r->min_ttl && h->ip6_hlim < r->min_ttl) {
2572 h->ip6_hlim = r->min_ttl;
2577 REASON_SET(reason, PFRES_MEMORY);
2581 REASON_SET(reason, PFRES_SHORT);
2585 REASON_SET(reason, PFRES_NORM);
2589 DPFPRINTF(("dropping bad IPv6 fragment\n"));
2590 REASON_SET(reason, PFRES_FRAG);
2595 pf_free_fragment(pff);
2597 if (r != NULL && r->log && pbuf_is_valid(pbuf)) {
2598 PFLOG_PACKET(kif, h, pbuf, AF_INET6, dir, *reason, r, NULL, NULL, pd);
2605 pf_normalize_tcp(int dir, struct pfi_kif *kif, pbuf_t *pbuf, int ipoff,
2606 int off, void *h, struct pf_pdesc *pd)
2608 #pragma unused(ipoff, h)
2609 struct pf_rule *r, *rm = NULL;
2610 struct tcphdr *th = pd->hdr.tcp;
2615 sa_family_t af = pd->af;
2616 struct pf_ruleset *ruleset = NULL;
2617 union pf_state_xport sxport, dxport;
2619 sxport.port = th->th_sport;
2620 dxport.port = th->th_dport;
2622 r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
2625 if (pfi_kif_match(r->kif, kif) == r->ifnot) {
2626 r = r->skip[PF_SKIP_IFP].ptr;
2627 } else if (r->direction && r->direction != dir) {
2628 r = r->skip[PF_SKIP_DIR].ptr;
2629 } else if (r->af && r->af != af) {
2630 r = r->skip[PF_SKIP_AF].ptr;
2631 } else if (r->proto && r->proto != pd->proto) {
2632 r = r->skip[PF_SKIP_PROTO].ptr;
2633 } else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
2635 r = r->skip[PF_SKIP_SRC_ADDR].ptr;
2636 } else if (r->src.xport.range.op &&
2637 !pf_match_xport(r->src.xport.range.op, r->proto_variant,
2638 &r->src.xport, &sxport)) {
2639 r = r->skip[PF_SKIP_SRC_PORT].ptr;
2640 } else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
2641 r->dst.neg, NULL)) {
2642 r = r->skip[PF_SKIP_DST_ADDR].ptr;
2643 } else if (r->dst.xport.range.op &&
2644 !pf_match_xport(r->dst.xport.range.op, r->proto_variant,
2645 &r->dst.xport, &dxport)) {
2646 r = r->skip[PF_SKIP_DST_PORT].ptr;
2647 } else if (r->os_fingerprint != PF_OSFP_ANY &&
2648 !pf_osfp_match(pf_osfp_fingerprint(pd, pbuf, off, th),
2649 r->os_fingerprint)) {
2650 r = TAILQ_NEXT(r, entries);
2652 if (r->anchor == NULL) {
2656 pf_step_into_anchor(&asd, &ruleset,
2657 PF_RULESET_SCRUB, &r, NULL, NULL);
2660 if (r == NULL && pf_step_out_of_anchor(&asd, &ruleset,
2661 PF_RULESET_SCRUB, &r, NULL, NULL)) {
2666 if (rm == NULL || rm->action == PF_NOSCRUB) {
2669 r->packets[dir == PF_OUT]++;
2670 r->bytes[dir == PF_OUT] += pd->tot_len;
2673 if (rm->rule_flag & PFRULE_REASSEMBLE_TCP) {
2674 pd->flags |= PFDESC_TCP_NORM;
2677 flags = th->th_flags;
2678 if (flags & TH_SYN) {
2679 /* Illegal packet */
2680 if (flags & TH_RST) {
2684 if (flags & TH_FIN) {
2688 /* Illegal packet */
2689 if (!(flags & (TH_ACK | TH_RST))) {
2694 if (!(flags & TH_ACK)) {
2695 /* These flags are only valid if ACK is set */
2696 if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG)) {
2701 /* Check for illegal header length */
2702 if (th->th_off < (sizeof(struct tcphdr) >> 2)) {
2706 /* If flags changed, or reserved data set, then adjust */
2707 if (flags != th->th_flags || th->th_x2 != 0) {
2710 ov = *(u_int16_t *)(&th->th_ack + 1);
2711 th->th_flags = flags;
2713 nv = *(u_int16_t *)(&th->th_ack + 1);
2715 th->th_sum = pf_cksum_fixup(th->th_sum, ov, nv, 0);
2719 /* Remove urgent pointer, if TH_URG is not set */
2720 if (!(flags & TH_URG) && th->th_urp) {
2721 th->th_sum = pf_cksum_fixup(th->th_sum, th->th_urp, 0, 0);
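/*
 * Illustrative worked example (editorial, not in the source):
 * pf_cksum_fixup() applies the RFC 1624 incremental checksum update,
 * so the 16-bit one's-complement sum never has to be recomputed over
 * the whole segment. Replacing old 16-bit word m with new word m'
 * updates the stored checksum HC as:
 *
 *	HC' = ~(~HC + ~m + m')
 *
 * Clearing a non-zero urgent pointer is the call just above: m is the
 * old th_urp value and m' is 0. The flag-byte fixup works the same
 * way: ov and nv read the 16-bit word that overlays th_off/th_x2/
 * th_flags (the word immediately after th_ack) before and after the
 * rewrite, and that pair goes through the same identity.
 */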
2726 /* copy back packet headers if we sanitized */
2727 /* Process options */
2729 int rv = pf_normalize_tcpopt(r, dir, kif, pd, pbuf, th, off,
2731 if (rv == PF_DROP) {
2738 if (pf_lazy_makewritable(pd, pbuf,
2739 off + sizeof(*th)) == NULL) {
2740 REASON_SET(&reason, PFRES_MEMORY);
2742 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, reason,
2748 pbuf_copy_back(pbuf, off, sizeof(*th), th);
2754 REASON_SET(&reason, PFRES_NORM);
2755 if (rm != NULL && r->log) {
2756 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, reason, r, NULL, NULL, pd);
2762 pf_normalize_tcp_init(pbuf_t *pbuf, int off, struct pf_pdesc *pd,
2763 struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
2766 u_int32_t tsval, tsecr;
2770 VERIFY(src->scrub == NULL);
2772 src->scrub = pool_get(&pf_state_scrub_pl, PR_NOWAIT);
2773 if (src->scrub == NULL) {
2776 bzero(src->scrub, sizeof(*src->scrub));
2781 struct ip *h = pbuf->pb_data;
2782 src->scrub->pfss_ttl = h->ip_ttl;
2788 struct ip6_hdr *h = pbuf->pb_data;
2789 src->scrub->pfss_ttl = h->ip6_hlim;
2797 * All normalizations below are only begun if we see the start of
2798 * the connection. They must all set an enabled bit in pfss_flags
2800 if ((th->th_flags & TH_SYN) == 0) {
2805 if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
2806 pf_pull_hdr(pbuf, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
2807 /* Diddle with TCP options */
2809 opt = hdr + sizeof(struct tcphdr);
2810 hlen = (th->th_off << 2) - sizeof(struct tcphdr);
2811 while (hlen >= TCPOLEN_TIMESTAMP) {
2813 case TCPOPT_EOL: /* FALLTHROUGH */
2818 case TCPOPT_TIMESTAMP:
2819 if (opt[1] >= TCPOLEN_TIMESTAMP) {
2820 src->scrub->pfss_flags |=
2822 src->scrub->pfss_ts_mod =
2825 /* note PFSS_PAWS not set yet */
2826 memcpy(&tsval, &opt[2],
2828 memcpy(&tsecr, &opt[6],
2830 src->scrub->pfss_tsval0 = ntohl(tsval);
2831 src->scrub->pfss_tsval = ntohl(tsval);
2832 src->scrub->pfss_tsecr = ntohl(tsecr);
2833 getmicrouptime(&src->scrub->pfss_last);
2837 hlen -= MAX(opt[1], 2);
2838 opt += MAX(opt[1], 2);
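/*
 * Illustrative note (editorial, not in the source): timestamp
 * modulation hides the host's real TS clock behind a per-connection
 * random offset. pfss_ts_mod is chosen once here, when the scrub rule
 * requests modulation on the initial SYN; thereafter
 * pf_normalize_tcp_stateful() adds the offset to every tsval this
 * peer sends and subtracts it from every tsecr echoed back, so both
 * endpoints still observe a consistent clock. E.g. with
 * ts_mod == 0x1234 a host tsval of 1000 crosses the firewall as
 * 1000 + 0x1234, and the peer's echo of that value is mapped back to
 * 1000 before the host sees it.
 */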
2848 pf_normalize_tcp_cleanup(struct pf_state *state)
2850 if (state->src.scrub) {
2851 pool_put(&pf_state_scrub_pl, state->src.scrub);
2853 if (state->dst.scrub) {
2854 pool_put(&pf_state_scrub_pl, state->dst.scrub);
2857 /* Someday... flush the TCP segment reassembly descriptors. */
2861 pf_normalize_tcp_stateful(pbuf_t *pbuf, int off, struct pf_pdesc *pd,
2862 u_short *reason, struct tcphdr *th, struct pf_state *state,
2863 struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
2865 struct timeval uptime;
2866 u_int32_t tsval = 0, tsecr = 0;
2867 u_int tsval_from_last;
2873 VERIFY(src->scrub || dst->scrub);
2876 * Enforce the minimum TTL seen for this connection. Negate a common
2877 * technique to evade an intrusion detection system and confuse
2878 * firewall state code.
2884 struct ip *h = pbuf->pb_data;
2885 if (h->ip_ttl > src->scrub->pfss_ttl) {
2886 src->scrub->pfss_ttl = h->ip_ttl;
2888 h->ip_ttl = src->scrub->pfss_ttl;
2896 struct ip6_hdr *h = pbuf->pb_data;
2897 if (h->ip6_hlim > src->scrub->pfss_ttl) {
2898 src->scrub->pfss_ttl = h->ip6_hlim;
2900 h->ip6_hlim = src->scrub->pfss_ttl;
2907 if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
2908 ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
2909 (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
2910 pf_pull_hdr(pbuf, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
2911 /* Diddle with TCP options */
2913 opt = hdr + sizeof(struct tcphdr);
2914 hlen = (th->th_off << 2) - sizeof(struct tcphdr);
2915 while (hlen >= TCPOLEN_TIMESTAMP) {
2917 case TCPOPT_EOL: /* FALLTHROUGH */
2922 case TCPOPT_TIMESTAMP:
2924 * Modulate the timestamps. Can be used for
2925 * NAT detection, OS uptime determination or
2930 /* Huh? Multiple timestamps!? */
2931 if (pf_status.debug >= PF_DEBUG_MISC) {
2932 DPFPRINTF(("multiple TS??"));
2933 pf_print_state(state);
2936 REASON_SET(reason, PFRES_TS);
2939 if (opt[1] >= TCPOLEN_TIMESTAMP) {
2940 memcpy(&tsval, &opt[2],
2942 if (tsval && src->scrub &&
2943 (src->scrub->pfss_flags &
2945 tsval = ntohl(tsval);
2946 pf_change_a(&opt[2],
2949 src->scrub->pfss_ts_mod),
2954 /* Modulate TS reply iff valid (!0) */
2955 memcpy(&tsecr, &opt[6],
2957 if (tsecr && dst->scrub &&
2958 (dst->scrub->pfss_flags &
2960 tsecr = ntohl(tsecr)
2961 - dst->scrub->pfss_ts_mod;
2962 pf_change_a(&opt[6],
2963 &th->th_sum, htonl(tsecr),
2971 hlen -= MAX(opt[1], 2);
2972 opt += MAX(opt[1], 2);
2977 /* Copyback the options, caller copies back header */
2978 int optoff = off + sizeof(*th);
2979 int optlen = (th->th_off << 2) - sizeof(*th);
2980 if (pf_lazy_makewritable(pd, pbuf, optoff + optlen) ==
2982 REASON_SET(reason, PFRES_MEMORY);
2985 *writeback = optoff + optlen;
2986 pbuf_copy_back(pbuf, optoff, optlen, hdr + sizeof(*th));
2992 * Must invalidate PAWS checks on connections idle for too long.
2993 * The fastest allowed timestamp clock is 1ms. That turns out to
2994 * be about 24 days before it wraps. XXX Right now our lowerbound
2995 * TS echo check only works for the first 12 days of a connection
2996 * when the TS has exhausted half its 32bit space
2998 #define TS_MAX_IDLE (24*24*60*60)
2999 #define TS_MAX_CONN (12*24*60*60) /* XXX remove when better tsecr check */
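/*
 * Illustrative arithmetic (editorial, not in the source): at the
 * fastest RFC 1323 clock of one tick per millisecond, the 32-bit
 * timestamp space wraps in 2^32 ms ~= 49.7 days, and the SEQ_-style
 * signed comparisons used below stay unambiguous only within half of
 * that, 2^31 ms ~= 24.8 days. Hence TS_MAX_IDLE is 24 days
 * (24*24*60*60 seconds), and the cruder tsecr lowerbound check gets
 * half of that again, 12 days, via TS_MAX_CONN.
 */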
3001 getmicrouptime(&uptime);
3002 if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
3003 (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
3004 pf_time_second() - state->creation > TS_MAX_CONN)) {
3005 if (pf_status.debug >= PF_DEBUG_MISC) {
3006 DPFPRINTF(("src idled out of PAWS\n"));
3007 pf_print_state(state);
3010 src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
3013 if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
3014 uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
3015 if (pf_status.debug >= PF_DEBUG_MISC) {
3016 DPFPRINTF(("dst idled out of PAWS\n"));
3017 pf_print_state(state);
3020 dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
3024 if (got_ts && src->scrub && dst->scrub &&
3025 (src->scrub->pfss_flags & PFSS_PAWS) &&
3026 (dst->scrub->pfss_flags & PFSS_PAWS)) {
3028 * Validate that the timestamps are "in-window".
3029 * RFC1323 describes TCP Timestamp options that allow
3030 * measurement of RTT (round trip time) and PAWS
3031 * (protection against wrapped sequence numbers). PAWS
3032 * gives us a set of rules for rejecting packets on
3033 * long fat pipes (packets that were somehow delayed
3034 * in transit longer than the time it took to send the
3035 * full TCP sequence space of 4Gb). We can use these
3036 * rules and infer a few others that will let us treat
3037 * the 32bit timestamp and the 32bit echoed timestamp
3038 * as sequence numbers to prevent a blind attacker from
3039 * inserting packets into a connection.
3042 * - The timestamp on this packet must be greater than
3043 * or equal to the last value echoed by the other
3044 * endpoint. The RFC says those will be discarded
3045 * since it is a dup that has already been acked.
3046 * This gives us a lowerbound on the timestamp.
3047 * timestamp >= other last echoed timestamp
3048 * - The timestamp will be less than or equal to
3049 * the last timestamp plus the time between the
3050 * last packet and now. The RFC defines the max
3051 * clock rate as 1ms. We will allow clocks to be
3052 * up to 10% fast and will allow a total difference
3053 * of 30 seconds due to a route change. And this
3054 * gives us an upperbound on the timestamp.
3055 * timestamp <= last timestamp + max ticks
3056 * We have to be careful here. Windows will send an
3057 * initial timestamp of zero and then initialize it
3058 * to a random value after the 3whs; presumably to
3059 * avoid a DoS by having to call an expensive RNG
3060 * during a SYN flood. Proof MS has at least one
3061 * good security geek.
3063 * - The TCP timestamp option must also echo the other
3064 * endpoint's timestamp. The timestamp echoed is the
3065 * one carried on the earliest unacknowledged segment
3066 * on the left edge of the sequence window. The RFC
3067 * states that the host will reject any echoed
3068 * timestamps that were larger than any ever sent.
3069 * This gives us an upperbound on the TS echo.
3070 * tsecr <= largest_tsval
3071 * - The lowerbound on the TS echo is a little more
3072 * tricky to determine. The other endpoint's echoed
3073 * values will not decrease. But there may be
3074 * network conditions that re-order packets and
3075 * cause our view of them to decrease. For now the
3076 * only lowerbound we can safely determine is that
3077 * the TS echo will never be less than the original
3078 * TS. XXX There is probably a better lowerbound.
3079 * Remove TS_MAX_CONN with better lowerbound check.
3080 * tsecr >= other original TS
3082 * It is also important to note that the fastest
3083 * timestamp clock of 1ms will wrap its 32bit space in
3084 * 24 days. So we just disable TS checking after 24
3085 * days of idle time. We actually must use a 12d
3086 * connection limit until we can come up with a better
3087 * lowerbound to the TS echo check.
3089 struct timeval delta_ts;
3094 * PFTM_TS_DIFF is how many seconds of leeway to allow
3095 * a host's timestamp. This can happen if the previous
3096 * packet got delayed in transit for much longer than
3099 if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0) {
3100 ts_fudge = pf_default_rule.timeout[PFTM_TS_DIFF];
3104 /* Calculate max ticks since the last timestamp */
3105 #define TS_MAXFREQ 1100 /* RFC max TS freq of 1Khz + 10% skew */
3106 #define TS_MICROSECS 1000000 /* microseconds per second */
3107 timersub(&uptime, &src->scrub->pfss_last, &delta_ts);
3108 tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
3109 tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS / TS_MAXFREQ);
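/*
 * Illustrative worked example (editorial, not in the source),
 * assuming the common 30-second PFTM_TS_DIFF default: for a packet
 * arriving 2.5 s after the previous one (delta_ts = 2 s, 500000 us):
 *
 *	tsval_from_last = (2 + 30) * 1100              = 35200
 *	                + 500000 / (1000000 / 1100)    = + 550
 *	                                               = 35750 ticks
 *
 * i.e. the peer's timestamp may advance at most 35750 ticks past
 * pfss_tsval before the upperbound test below fails.
 */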
3112 if ((src->state >= TCPS_ESTABLISHED &&
3113 dst->state >= TCPS_ESTABLISHED) &&
3114 (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
3115 SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
3116 (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
3117 SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
3119 * Bad RFC1323 implementation or an insertion attack.
3121 * - Solaris 2.6 and 2.7 are known to send another ACK
3122 * after the FIN,FIN|ACK,ACK closing that carries
3126 DPFPRINTF(("Timestamp failed %c%c%c%c\n",
3127 SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
3128 SEQ_GT(tsval, src->scrub->pfss_tsval +
3129 tsval_from_last) ? '1' : ' ',
3130 SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
3131 SEQ_LT(tsecr, dst->scrub->pfss_tsval0) ? '3' : ' '));
3132 DPFPRINTF((" tsval: %u tsecr: %u +ticks: %u "
3133 "idle: %lus %ums\n",
3134 tsval, tsecr, tsval_from_last, delta_ts.tv_sec,
3135 delta_ts.tv_usec / 1000));
3136 DPFPRINTF((" src->tsval: %u tsecr: %u\n",
3137 src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
3138 DPFPRINTF((" dst->tsval: %u tsecr: %u tsval0: %u\n",
3139 dst->scrub->pfss_tsval, dst->scrub->pfss_tsecr,
3140 dst->scrub->pfss_tsval0));
3141 if (pf_status.debug >= PF_DEBUG_MISC) {
3142 pf_print_state(state);
3143 pf_print_flags(th->th_flags);
3146 REASON_SET(reason, PFRES_TS);
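/*
 * Illustrative worked example (editorial, not in the source): suppose
 * src last sent tsval 5000 (src pfss_tsval), dst last echoed 4800
 * (dst pfss_tsecr), dst last sent tsval 9000 (dst pfss_tsval), dst's
 * first timestamp was 8000 (dst pfss_tsval0), and tsval_from_last is
 * 35750. A packet from src then survives the check above only if:
 *
 *	tsval >= 4800                 (not older than dst's last echo)
 *	tsval <= 5000 + 35750 = 40750 (clock cannot jump too far ahead)
 *	tsecr <= 9000                 (cannot echo a TS dst never sent)
 *	tsecr >= 8000                 (cannot echo below dst's first TS)
 *
 * A blind attacker must land inside all four windows at once, which
 * shrinks the usable forgery space far below what sequence numbers
 * alone would allow.
 */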
3150 /* XXX I'd really like to require tsecr but it's optional */
3151 } else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
3152 ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
3153 || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
3154 src->scrub && dst->scrub &&
3155 (src->scrub->pfss_flags & PFSS_PAWS) &&
3156 (dst->scrub->pfss_flags & PFSS_PAWS)) {
3158 * Didn't send a timestamp. Timestamps aren't really useful
3160 * - connection opening or closing (often not even sent).
3161 * But we must not let an attacker put a FIN on a
3162 * data packet to sneak it through our ESTABLISHED check.
3163 * - on a TCP reset. RFC suggests not even looking at TS.
3164 * - on an empty ACK. The TS will not be echoed so it will
3165 * probably not help keep the RTT calculation in sync and
3166 * there isn't as much danger when the sequence numbers
3167 * got wrapped. So some stacks don't include TS on empty
3170 * To minimize the disruption to mostly RFC1323 conformant
3171 * stacks, we will only require timestamps on data packets.
3173 * And what do ya know, we cannot require timestamps on data
3174 * packets. There appear to be devices that do legitimate
3175 * TCP connection hijacking. There are HTTP devices that allow
3176 * a 3whs (with timestamps) and then buffer the HTTP request.
3177 * If the intermediate device has the HTTP response cache, it
3178 * will spoof the response but not bother timestamping its
3179 * packets. So we can look for the presence of a timestamp in
3180 * the first data packet and if there, require it in all future
3184 if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
3186 * Hey! Someone tried to sneak a packet in. Or the
3187 * stack changed its RFC1323 behavior?!?!
3189 if (pf_status.debug >= PF_DEBUG_MISC) {
3190 DPFPRINTF(("Did not receive expected RFC1323 "
3192 pf_print_state(state);
3193 pf_print_flags(th->th_flags);
3196 REASON_SET(reason, PFRES_TS);
3203 * We will note if a host sends its data packets with or without
3204 * timestamps. And require all data packets to contain a timestamp
3205 * if the first does. PAWS implicitly requires that all data packets be
3206 * timestamped. But I think there are middle-man devices that hijack
3207 * TCP streams immediately after the 3whs and don't timestamp their
3208 * packets (seen in a WWW accelerator or cache).
3210 if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
3211 (PFSS_TIMESTAMP | PFSS_DATA_TS | PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
3213 src->scrub->pfss_flags |= PFSS_DATA_TS;
3215 src->scrub->pfss_flags |= PFSS_DATA_NOTS;
3216 if (pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
3217 (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
3218 /* Don't warn if other host rejected RFC1323 */
3219 DPFPRINTF(("Broken RFC1323 stack did not "
3220 "timestamp data packet. Disabled PAWS "
3222 pf_print_state(state);
3223 pf_print_flags(th->th_flags);
3231 * Update PAWS values
3233 if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
3234 (PFSS_PAWS_IDLED | PFSS_TIMESTAMP))) {
3235 getmicrouptime(&src->scrub->pfss_last);
3236 if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
3237 (src->scrub->pfss_flags & PFSS_PAWS) == 0) {
3238 src->scrub->pfss_tsval = tsval;
3242 if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
3243 (src->scrub->pfss_flags & PFSS_PAWS) == 0) {
3244 src->scrub->pfss_tsecr = tsecr;
3247 if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
3248 (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
3249 src->scrub->pfss_tsval0 == 0)) {
3250 /* tsval0 MUST be the lowest timestamp */
3251 src->scrub->pfss_tsval0 = tsval;
3254 /* Only fully initialized after a TS gets echoed */
3255 if ((src->scrub->pfss_flags & PFSS_PAWS) == 0) {
3256 src->scrub->pfss_flags |= PFSS_PAWS;
3261 /* I have a dream.... TCP segment reassembly.... */
3266 pf_normalize_tcpopt(struct pf_rule *r, int dir, struct pfi_kif *kif,
3267 struct pf_pdesc *pd, pbuf_t *pbuf, struct tcphdr *th, int off,
3270 #pragma unused(dir, kif)
3271 sa_family_t af = pd->af;
3274 int opt, cnt, optlen = 0;
3276 u_char opts[MAX_TCPOPTLEN];
3277 u_char *optp = opts;
3279 thoff = th->th_off << 2;
3280 cnt = thoff - sizeof(struct tcphdr);
3282 if (cnt > 0 && !pf_pull_hdr(pbuf, off + sizeof(*th), opts, cnt,
3287 for (; cnt > 0; cnt -= optlen, optp += optlen) {
3289 if (opt == TCPOPT_EOL) {
3292 if (opt == TCPOPT_NOP) {
3299 if (optlen < 2 || optlen > cnt) {
3305 mss = (u_int16_t *)(void *)(optp + 2);
3306 if ((ntohs(*mss)) > r->max_mss) {
3309 * Only do the TCP checksum fixup if delayed
3310 * checksum calculation will not be performed.
3313 !(*pbuf->pb_csum_flags & CSUM_TCP)) {
3314 th->th_sum = pf_cksum_fixup(th->th_sum,
3315 *mss, htons(r->max_mss), 0);
3317 *mss = htons(r->max_mss);
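/*
 * Illustrative example (editorial, not in the source): with a pf.conf
 * scrub rule such as "scrub in all max-mss 1440", a SYN advertising
 * MSS 1460 has its option rewritten in place to 1440, and th_sum is
 * patched incrementally (old word 1460, new word 1440) unless the
 * packet is flagged for delayed TCP checksum calculation (CSUM_TCP),
 * in which case the stack recomputes the checksum later anyway and
 * the fixup would be redundant work.
 */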
3329 VERIFY(pbuf == pd->mp);
3331 if (pf_lazy_makewritable(pd, pd->mp,
3332 off + sizeof(*th) + thoff) == NULL) {
3333 REASON_SET(&reason, PFRES_MEMORY);
3335 PFLOG_PACKET(kif, h, pbuf, AF_INET, dir, reason,
3342 pbuf_copy_back(pd->mp, off + sizeof(*th), thoff - sizeof(*th), opts);