2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
61 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/kernel.h>
75 #include <sys/malloc.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <kern/locks.h>
81 #include <sys/sysctl.h>
82 #include <sys/mcache.h>
84 #include <machine/endian.h>
85 #include <pexpert/pexpert.h>
88 #include <net/if_dl.h>
89 #include <net/if_types.h>
90 #include <net/route.h>
91 #include <net/ntstat.h>
92 #include <net/net_osdep.h>
94 #include <netinet/in.h>
95 #include <netinet/in_systm.h>
96 #include <netinet/ip.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_var.h>
99 #include <netinet/ip_var.h>
101 #include <netinet/kpi_ipfilter_var.h>
104 #include <security/mac_framework.h>
109 #include <net/dlil.h>
110 #include <sys/kdebug.h>
111 #include <libkern/OSAtomic.h>
/*
 * Trace codes used with KERNEL_DEBUG() in this file.  Each one is built
 * by NETDBG_CODE() with DBG_NETIP as its first argument.
 *
 *   DBG_LAYER_BEG / DBG_LAYER_END  - paired codes (1 and 3); DBG_LAYER_BEG
 *                                    is logged together with the packet's
 *                                    dst/src/proto/off/len fields.
 *   DBG_FNC_IP_OUTPUT              - combined with DBG_FUNC_START on entry
 *                                    to the IP output path.
 *   DBG_FNC_IPSEC4_OUTPUT          - combined with DBG_FUNC_START and
 *                                    DBG_FUNC_END around IPsec output
 *                                    processing.
 */
113 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
114 #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
115 #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
116 #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
/*
 * SWAP16(v): exchange the two low-order bytes of v.
 *
 * Both halves are masked to 8 bits so the result always fits in 16 bits.
 * Without the mask on the shifted half, any bits above bit 15 (including
 * the sign-extension bits of a negative signed value) would leak into the
 * result.  For values already in the range 0..0xffff the result is the
 * same as before.
 *
 * NOTE: v is evaluated twice; do not pass an expression with side effects.
 */
#define SWAP16(v) ((((v) & 0xff) << 8) | (((v) >> 8) & 0xff))
121 #include <netinet6/ipsec.h>
122 #include <netkey/key.h>
124 #include <netkey/key_debug.h>
126 #define KEYDEBUG(lev,arg)
130 #include <netinet/ip_fw.h>
131 #include <netinet/ip_divert.h>
132 #include <mach/sdt.h>
135 #include <netinet/ip_dummynet.h>
139 #include <net/pfvar.h>
142 #if IPFIREWALL_FORWARD_DEBUG
/*
 * print_ip(a): debug helper that prints the IPv4 address stored in
 * a.s_addr in dotted-quad form ("w.x.y.z") using printf().
 *
 * a.s_addr is passed through ntohl() and each octet is extracted by
 * shifting and masking with 0xFF.
 *
 * The expansion is a single expression with NO trailing semicolon, so the
 * caller supplies it ("print_ip(x);").  This keeps the macro usable as the
 * body of an if/else without producing an empty statement.  Each octet is
 * cast to unsigned int to match the %u conversions; the previous %ld
 * conversions did not match the type of the arguments.
 *
 * NOTE: the argument is evaluated four times; do not pass an expression
 * with side effects.
 */
#define print_ip(a) \
	printf("%u.%u.%u.%u", \
	    (unsigned int)((ntohl((a).s_addr) >> 24) & 0xFF), \
	    (unsigned int)((ntohl((a).s_addr) >> 16) & 0xFF), \
	    (unsigned int)((ntohl((a).s_addr) >> 8) & 0xFF), \
	    (unsigned int)(ntohl((a).s_addr) & 0xFF))
152 static struct mbuf
*ip_insertoptions(struct mbuf
*, struct mbuf
*, int *);
153 static void ip_mloopback(struct ifnet
*, struct mbuf
*,
154 struct sockaddr_in
*, int);
155 static int ip_pcbopts(int, struct mbuf
**, struct mbuf
*);
156 static void imo_trace(struct ip_moptions
*, int);
158 static void ip_out_cksum_stats(int, u_int32_t
);
159 static struct ifaddr
*in_selectsrcif(struct ip
*, struct route
*, unsigned int);
161 int ip_optcopy(struct ip
*, struct ip
*);
162 void in_delayed_cksum_offset(struct mbuf
*, int );
163 void in_cksum_offset(struct mbuf
* , size_t );
165 extern int (*fr_checkp
)(struct ip
*, int, struct ifnet
*, int, struct mbuf
**);
167 extern struct protosw inetsw
[];
169 extern struct ip_linklocal_stat ip_linklocal_stat
;
170 extern lck_mtx_t
*ip_mutex
;
172 /* temporary: for testing */
174 extern int ipsec_bypass
;
/*
 * ip_maxchainsent: integer exported as the "maxchainsent" sysctl under
 * _net_inet_ip (CTLFLAG_RW | CTLFLAG_LOCKED), initially 0.  Its sysctl
 * description is "use dlil_output_list".
 * NOTE(review): the code that reads or updates it is not visible here.
 */
177 static int ip_maxchainsent
= 0;
178 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, maxchainsent
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
179 &ip_maxchainsent
, 0, "use dlil_output_list");
/*
 * forge_ce: integer exported as the "forge_ce" sysctl under _net_inet_ip
 * (CTLFLAG_RW | CTLFLAG_LOCKED), initially 0.  Its sysctl description is
 * "Forge ECN CE".
 * NOTE(review): presumably gates the debugging path that rewrites an
 * ECT0/ECT1 TOS field to CE in the output routine -- confirm.
 */
181 static int forge_ce
= 0;
182 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, forge_ce
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
183 &forge_ce
, 0, "Forge ECN CE");
/*
 * ip_select_srcif_debug: integer exported as the "select_srcif_debug"
 * sysctl under _net_inet_ip (CTLFLAG_RW | CTLFLAG_LOCKED), initially 0.
 * Its sysctl description is "log source interface selection debug info".
 */
186 static int ip_select_srcif_debug
= 0;
187 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, select_srcif_debug
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
188 &ip_select_srcif_debug
, 0, "log source interface selection debug info");
190 #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
193 __private_extern__
unsigned int imo_trace_hist_size
= IMO_TRACE_HIST_SIZE
;
195 struct ip_moptions_dbg
{
196 struct ip_moptions imo
; /* ip_moptions */
197 u_int16_t imo_refhold_cnt
; /* # of IMO_ADDREF */
198 u_int16_t imo_refrele_cnt
; /* # of IMO_REMREF */
200 * Alloc and free callers.
205 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
207 ctrace_t imo_refhold
[IMO_TRACE_HIST_SIZE
];
208 ctrace_t imo_refrele
[IMO_TRACE_HIST_SIZE
];
212 static unsigned int imo_debug
= 1; /* debugging (enabled) */
214 static unsigned int imo_debug
; /* debugging (disabled) */
216 static unsigned int imo_size
; /* size of zone element */
217 static struct zone
*imo_zone
; /* zone for ip_moptions */
219 #define IMO_ZONE_MAX 64 /* maximum elements in zone */
220 #define IMO_ZONE_NAME "ip_moptions" /* zone name */
223 * IP output. The packet in mbuf chain m contains a skeletal IP
224 * header (with len, off, ttl, proto, tos, src, dst).
225 * The mbuf chain containing the packet will be freed.
226 * The mbuf opt, if present, will not be freed.
234 struct ip_moptions
*imo
,
235 struct ip_out_args
*ipoa
)
238 error
= ip_output_list(m0
, 0, opt
, ro
, flags
, imo
, ipoa
);
251 * ipsec4_getpolicybyaddr:??? [IPSEC 4th argument, contents modified]
252 * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified]
253 * key_spdacquire:??? [IPSEC]
254 * ipsec4_output:??? [IPSEC]
255 * <fr_checkp>:??? [firewall]
256 * ip_dn_io_ptr:??? [dummynet]
257 * dlil_output:??? [DLIL]
258 * dlil_output_list:??? [DLIL]
260 * Notes: The ipsec4_getpolicyby{addr|sock} function error returns are
261 * only used as the error return from this function where one of
262 * these functions fails to return a policy.
271 struct ip_moptions
*imo
,
272 struct ip_out_args
*ipoa
276 struct ifnet
*ifp
= NULL
;
277 struct mbuf
*m
= m0
, **mppn
= NULL
;
278 int hlen
= sizeof (struct ip
);
279 int len
= 0, error
= 0;
280 struct sockaddr_in
*dst
= NULL
;
281 struct in_ifaddr
*ia
= NULL
, *src_ia
= NULL
;
282 int isbroadcast
, sw_csum
;
283 struct in_addr pkt_dst
;
284 struct ipf_pktopts
*ippo
= NULL
, ipf_pktopts
;
286 struct route iproute
;
287 struct socket
*so
= NULL
;
288 struct secpolicy
*sp
= NULL
;
290 #if IPFIREWALL_FORWARD
291 int fwd_rewrite_src
= 0;
295 struct ip_fw_args args
;
297 struct sockaddr_in
*next_hop_from_ipfwd_tag
= NULL
;
300 ipfilter_t inject_filter_ref
= 0;
302 struct route saved_route
;
303 struct ip_out_args saved_ipoa
;
304 struct sockaddr_in dst_buf
;
305 #endif /* DUMMYNET */
306 struct mbuf
* packetlist
;
307 int pktcnt
= 0, tso
= 0;
308 u_int32_t bytecnt
= 0;
309 unsigned int ifscope
;
311 boolean_t select_srcif
;
312 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT
| DBG_FUNC_START
, 0,0,0,0,0);
316 args
.next_hop
= NULL
;
319 args
.divert_rule
= 0; /* divert cookie */
322 if (SLIST_EMPTY(&m0
->m_pkthdr
.tags
))
325 /* Grab info from mtags prepended to the chain */
327 if ((tag
= m_tag_locate(m0
, KERNEL_MODULE_TAG_ID
,
328 KERNEL_TAG_TYPE_DUMMYNET
, NULL
)) != NULL
) {
329 struct dn_pkt_tag
*dn_tag
;
331 dn_tag
= (struct dn_pkt_tag
*)(tag
+1);
332 args
.rule
= dn_tag
->rule
;
334 saved_route
= dn_tag
->ro
;
338 bcopy(&dn_tag
->dn_dst
, &dst_buf
, sizeof(dst_buf
));
341 flags
= dn_tag
->flags
;
342 saved_ipoa
= dn_tag
->ipoa
;
345 m_tag_delete(m0
, tag
);
347 #endif /* DUMMYNET */
350 if ((tag
= m_tag_locate(m0
, KERNEL_MODULE_TAG_ID
,
351 KERNEL_TAG_TYPE_DIVERT
, NULL
)) != NULL
) {
352 struct divert_tag
*div_tag
;
354 div_tag
= (struct divert_tag
*)(tag
+1);
355 args
.divert_rule
= div_tag
->cookie
;
357 m_tag_delete(m0
, tag
);
359 #endif /* IPDIVERT */
361 if ((tag
= m_tag_locate(m0
, KERNEL_MODULE_TAG_ID
,
362 KERNEL_TAG_TYPE_IPFORWARD
, NULL
)) != NULL
) {
363 struct ip_fwd_tag
*ipfwd_tag
;
365 ipfwd_tag
= (struct ip_fwd_tag
*)(tag
+1);
366 next_hop_from_ipfwd_tag
= ipfwd_tag
->next_hop
;
368 m_tag_delete(m0
, tag
);
371 #endif /* IPFIREWALL */
376 if ( !m
|| (m
->m_flags
& M_PKTHDR
) != 0)
377 panic("ip_output no HDR");
379 panic("ip_output no route, proto = %d",
380 mtod(m
, struct ip
*)->ip_p
);
383 bzero(&ipf_pktopts
, sizeof(struct ipf_pktopts
));
387 * At present the IP_OUTARGS flag implies a request for IP to
388 * perform source interface selection. In the forwarding case,
389 * only the ifscope value is used, as source interface selection
390 * doesn't take place.
392 if (ip_doscopedroute
&& (flags
& IP_OUTARGS
)) {
393 select_srcif
= !(flags
& IP_FORWARDING
);
394 ifscope
= ipoa
->ipoa_boundif
;
395 ipf_pktopts
.ippo_flags
= IPPOF_BOUND_IF
;
396 ipf_pktopts
.ippo_flags
|= (ifscope
<< IPPOF_SHIFT_IFSCOPE
);
398 select_srcif
= FALSE
;
399 ifscope
= IFSCOPE_NONE
;
402 if (flags
& IP_OUTARGS
) {
403 nocell
= ipoa
->ipoa_nocell
;
405 ipf_pktopts
.ippo_flags
|= IPPOF_NO_IFT_CELLULAR
;
411 if (args
.rule
!= NULL
) { /* dummynet already saw us */
412 ip
= mtod(m
, struct ip
*);
413 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2 ;
414 if (ro
->ro_rt
!= NULL
) {
415 RT_LOCK_SPIN(ro
->ro_rt
);
416 ia
= (struct in_ifaddr
*)ro
->ro_rt
->rt_ifa
;
418 /* Become a regular mutex */
419 RT_CONVERT_LOCK(ro
->ro_rt
);
420 IFA_ADDREF(&ia
->ia_ifa
);
422 RT_UNLOCK(ro
->ro_rt
);
425 if (ipsec_bypass
== 0 && (flags
& IP_NOIPSEC
) == 0) {
426 so
= ipsec_getsocket(m
);
427 (void)ipsec_setsocket(m
, NULL
);
432 #endif /* IPFIREWALL */
435 if (ipsec_bypass
== 0 && (flags
& IP_NOIPSEC
) == 0) {
436 so
= ipsec_getsocket(m
);
437 (void)ipsec_setsocket(m
, NULL
);
442 * No need to process packet twice if we've
445 if (!SLIST_EMPTY(&m
->m_pkthdr
.tags
))
446 inject_filter_ref
= ipf_get_inject_filter(m
);
448 inject_filter_ref
= 0;
451 m
= ip_insertoptions(m
, opt
, &len
);
454 ip
= mtod(m
, struct ip
*);
459 * When dealing with a packet chain, we need to reset "next_hop" because
460 * "dst" may have been changed to the gateway address below for the previous
461 * packet of the chain. This could cause the route to be inadvertently changed
462 * to the route to the gateway address (instead of the route to the destination).
464 args
.next_hop
= next_hop_from_ipfwd_tag
;
465 pkt_dst
= args
.next_hop
? args
.next_hop
->sin_addr
: ip
->ip_dst
;
467 pkt_dst
= ip
->ip_dst
;
471 * We must not send if the packet is destined to network zero.
472 * RFC1122 3.2.1.3 (a) and (b).
474 if (IN_ZERONET(ntohl(pkt_dst
.s_addr
))) {
475 error
= EHOSTUNREACH
;
482 if ((flags
& (IP_FORWARDING
|IP_RAWOUTPUT
)) == 0) {
483 ip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, hlen
>> 2);
486 ip
->ip_id
= ip_randomid();
488 ip
->ip_id
= htons(ip_id
++);
490 OSAddAtomic(1, &ipstat
.ips_localout
);
492 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
496 /* For debugging, we let the stack forge congestion */
498 ((ip
->ip_tos
& IPTOS_ECN_MASK
) == IPTOS_ECN_ECT1
||
499 (ip
->ip_tos
& IPTOS_ECN_MASK
) == IPTOS_ECN_ECT0
)) {
500 ip
->ip_tos
= (ip
->ip_tos
& ~IPTOS_ECN_MASK
) | IPTOS_ECN_CE
;
505 KERNEL_DEBUG(DBG_LAYER_BEG
, ip
->ip_dst
.s_addr
,
506 ip
->ip_src
.s_addr
, ip
->ip_p
, ip
->ip_off
, ip
->ip_len
);
508 dst
= (struct sockaddr_in
*)&ro
->ro_dst
;
511 * If there is a cached route,
512 * check that it is to the same destination
513 * and is still up. If not, free it and try again.
514 * The address family should also be checked in case of sharing the
518 if (ro
->ro_rt
!= NULL
) {
519 if (ro
->ro_rt
->generation_id
!= route_generation
&&
520 ((flags
& (IP_ROUTETOIF
| IP_FORWARDING
)) == 0) &&
521 (ip
->ip_src
.s_addr
!= INADDR_ANY
)) {
522 src_ia
= ifa_foraddr(ip
->ip_src
.s_addr
);
523 if (src_ia
== NULL
) {
524 error
= EADDRNOTAVAIL
;
527 IFA_REMREF(&src_ia
->ia_ifa
);
530 * Test rt_flags without holding rt_lock for performance
531 * reasons; if the route is down it will hopefully be
532 * caught by the layer below (since it uses this route
533 * as a hint) or during the next transmit.
535 if ((ro
->ro_rt
->rt_flags
& RTF_UP
) == 0 ||
536 dst
->sin_family
!= AF_INET
||
537 dst
->sin_addr
.s_addr
!= pkt_dst
.s_addr
) {
542 * If we're doing source interface selection, we may not
543 * want to use this route; only synch up the generation
546 if (!select_srcif
&& ro
->ro_rt
!= NULL
&&
547 ro
->ro_rt
->generation_id
!= route_generation
)
548 ro
->ro_rt
->generation_id
= route_generation
;
550 if (ro
->ro_rt
== NULL
) {
551 bzero(dst
, sizeof(*dst
));
552 dst
->sin_family
= AF_INET
;
553 dst
->sin_len
= sizeof(*dst
);
554 dst
->sin_addr
= pkt_dst
;
557 * If routing to interface only,
558 * short circuit routing lookup.
560 #define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
561 #define sintosa(sin) ((struct sockaddr *)(sin))
562 if (flags
& IP_ROUTETOIF
) {
564 IFA_REMREF(&ia
->ia_ifa
);
565 if ((ia
= ifatoia(ifa_ifwithdstaddr(sintosa(dst
)))) == 0) {
566 if ((ia
= ifatoia(ifa_ifwithnet(sintosa(dst
)))) == 0) {
567 OSAddAtomic(1, &ipstat
.ips_noroute
);
574 isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
575 } else if (IN_MULTICAST(ntohl(pkt_dst
.s_addr
)) &&
576 imo
!= NULL
&& (ifp
= imo
->imo_multicast_ifp
) != NULL
) {
578 * Bypass the normal routing lookup for multicast
579 * packets if the interface is specified.
583 IFA_REMREF(&ia
->ia_ifa
);
585 /* Macro takes reference on ia */
588 boolean_t cloneok
= FALSE
;
590 * Perform source interface selection; the source IP address
591 * must belong to one of the addresses of the interface used
592 * by the route. For performance reasons, do this only if
593 * there is no route, or if the routing table has changed,
594 * or if we haven't done source interface selection on this
595 * route (for this PCB instance) before.
597 if (select_srcif
&& ip
->ip_src
.s_addr
!= INADDR_ANY
&&
598 (ro
->ro_rt
== NULL
|| !(ro
->ro_rt
->rt_flags
& RTF_UP
) ||
599 ro
->ro_rt
->generation_id
!= route_generation
||
600 !(ro
->ro_flags
& ROF_SRCIF_SELECTED
))) {
603 /* Find the source interface */
604 ifa
= in_selectsrcif(ip
, ro
, ifscope
);
607 * If the source address belongs to a cellular interface
608 * and the caller forbids our using interfaces of such
609 * type, pretend that there is no source address.
611 if (nocell
&& ifa
!= NULL
&&
612 ifa
->ifa_ifp
->if_type
== IFT_CELLULAR
) {
614 error
= EADDRNOTAVAIL
;
619 * If the source address is spoofed (in the case
620 * of IP_RAWOUTPUT), or if this is destined for
621 * local/loopback, just let it go out using the
622 * interface of the route. Otherwise, there's no
623 * interface having such an address, so bail out.
625 if (ifa
== NULL
&& !(flags
& IP_RAWOUTPUT
) &&
626 ifscope
!= lo_ifp
->if_index
) {
627 error
= EADDRNOTAVAIL
;
632 * If the caller didn't explicitly specify the scope,
633 * pick it up from the source interface. If the cached
634 * route was wrong and was blown away as part of source
635 * interface selection, don't mask out RTF_PRCLONING
636 * since that route may have been allocated by the ULP,
637 * unless the IP header was created by the caller or
638 * the destination is IPv4 LLA. The check for the
639 * latter is needed because IPv4 LLAs are never scoped
640 * in the current implementation, and we don't want to
641 * replace the resolved IPv4 LLA route with one whose
642 * gateway points to that of the default gateway on
643 * the primary interface of the system.
646 if (ifscope
== IFSCOPE_NONE
)
647 ifscope
= ifa
->ifa_ifp
->if_index
;
649 cloneok
= (!(flags
& IP_RAWOUTPUT
) &&
650 !(IN_LINKLOCAL(ntohl(ip
->ip_dst
.s_addr
))));
655 * If this is the case, we probably don't want to allocate
656 * a protocol-cloned route since we didn't get one from the
657 * ULP. This lets TCP do its thing, while not burdening
658 * forwarding or ICMP with the overhead of cloning a route.
659 * Of course, we still want to do any cloning requested by
660 * the link layer, as this is probably required in all cases
661 * for correct operation (as it is for ARP).
663 if (ro
->ro_rt
== NULL
) {
664 unsigned long ign
= RTF_PRCLONING
;
666 * We make an exception here: if the destination
667 * address is INADDR_BROADCAST, allocate a protocol-
668 * cloned host route so that we end up with a route
669 * marked with the RTF_BROADCAST flag. Otherwise,
670 * we would end up referring to the default route,
671 * instead of creating a cloned host route entry.
672 * That would introduce inconsistencies between ULPs
673 * that allocate a route and those that don't. The
674 * RTF_BROADCAST route is important since we'd want
675 * to send out undirected IP broadcast packets using
676 * link-level broadcast address. Another exception
677 * is for ULP-created routes that got blown away by
678 * source interface selection (see above).
680 * These exceptions will no longer be necessary when
681 * the RTF_PRCLONING scheme is no longer present.
683 if (cloneok
|| dst
->sin_addr
.s_addr
== INADDR_BROADCAST
)
684 ign
&= ~RTF_PRCLONING
;
687 * Loosen the route lookup criteria if the ifscope
688 * corresponds to the loopback interface; this is
689 * needed to support Application Layer Gateways
690 * listening on loopback, in conjunction with packet
691 * filter redirection rules. The final source IP
692 * address will be rewritten by the packet filter
693 * prior to the RFC1122 loopback check below.
695 if (ifscope
== lo_ifp
->if_index
)
696 rtalloc_ign(ro
, ign
);
698 rtalloc_scoped_ign(ro
, ign
, ifscope
);
701 * If the route points to a cellular interface and the
702 * caller forbids our using interfaces of such type,
703 * pretend that there is no route.
705 if (nocell
&& ro
->ro_rt
!= NULL
) {
706 RT_LOCK_SPIN(ro
->ro_rt
);
707 if (ro
->ro_rt
->rt_ifp
->if_type
==
709 RT_UNLOCK(ro
->ro_rt
);
713 RT_UNLOCK(ro
->ro_rt
);
718 if (ro
->ro_rt
== NULL
) {
719 OSAddAtomic(1, &ipstat
.ips_noroute
);
720 error
= EHOSTUNREACH
;
725 IFA_REMREF(&ia
->ia_ifa
);
726 RT_LOCK_SPIN(ro
->ro_rt
);
727 ia
= ifatoia(ro
->ro_rt
->rt_ifa
);
729 /* Become a regular mutex */
730 RT_CONVERT_LOCK(ro
->ro_rt
);
731 IFA_ADDREF(&ia
->ia_ifa
);
733 ifp
= ro
->ro_rt
->rt_ifp
;
735 if (ro
->ro_rt
->rt_flags
& RTF_GATEWAY
)
736 dst
= (struct sockaddr_in
*)ro
->ro_rt
->rt_gateway
;
737 if (ro
->ro_rt
->rt_flags
& RTF_HOST
) {
738 isbroadcast
= (ro
->ro_rt
->rt_flags
& RTF_BROADCAST
);
740 /* Become a regular mutex */
741 RT_CONVERT_LOCK(ro
->ro_rt
);
742 isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
744 RT_UNLOCK(ro
->ro_rt
);
747 if (IN_MULTICAST(ntohl(pkt_dst
.s_addr
))) {
748 struct in_multi
*inm
;
750 u_int8_t ttl
= IP_DEFAULT_MULTICAST_TTL
;
751 u_int8_t loop
= IP_DEFAULT_MULTICAST_LOOP
;
753 m
->m_flags
|= M_MCAST
;
755 * IP destination address is multicast. Make sure "dst"
756 * still points to the address in "ro". (It may have been
757 * changed to point to a gateway address, above.)
759 dst
= (struct sockaddr_in
*)&ro
->ro_dst
;
761 * See if the caller provided any multicast options
765 vif
= imo
->imo_multicast_vif
;
766 ttl
= imo
->imo_multicast_ttl
;
767 loop
= imo
->imo_multicast_loop
;
768 if ((flags
& IP_RAWOUTPUT
) == 0)
770 if (imo
->imo_multicast_ifp
!= NULL
)
771 ifp
= imo
->imo_multicast_ifp
;
774 if (vif
!= -1 && ((flags
& IP_RAWOUTPUT
) == 0 ||
775 ip
->ip_src
.s_addr
== INADDR_ANY
))
776 ip
->ip_src
.s_addr
= ip_mcast_src(vif
);
777 #endif /* MROUTING */
778 } else if ((flags
& IP_RAWOUTPUT
) == 0) {
783 * Confirm that the outgoing interface supports multicast.
785 if (imo
== NULL
|| vif
== -1) {
786 if ((ifp
->if_flags
& IFF_MULTICAST
) == 0) {
787 OSAddAtomic(1, &ipstat
.ips_noroute
);
793 * If source address not specified yet, use address
794 * of outgoing interface.
796 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
797 struct in_ifaddr
*ia1
;
798 lck_rw_lock_shared(in_ifaddr_rwlock
);
799 TAILQ_FOREACH(ia1
, &in_ifaddrhead
, ia_link
) {
800 IFA_LOCK_SPIN(&ia1
->ia_ifa
);
801 if (ia1
->ia_ifp
== ifp
) {
802 ip
->ip_src
= IA_SIN(ia1
)->sin_addr
;
803 IFA_UNLOCK(&ia1
->ia_ifa
);
806 IFA_UNLOCK(&ia1
->ia_ifa
);
808 lck_rw_done(in_ifaddr_rwlock
);
809 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
815 in_multihead_lock_shared();
816 IN_LOOKUP_MULTI(&pkt_dst
, ifp
, inm
);
817 in_multihead_lock_done();
818 if (inm
!= NULL
&& (imo
== NULL
|| loop
)) {
820 * If we belong to the destination multicast group
821 * on the outgoing interface, and the caller did not
822 * forbid loopback, loop back a copy.
824 if (!TAILQ_EMPTY(&ipv4_filters
)) {
825 struct ipfilter
*filter
;
826 int seen
= (inject_filter_ref
== 0);
829 ipf_pktopts
.ippo_flags
|= IPPOF_MCAST_OPTS
;
830 ipf_pktopts
.ippo_mcast_ifnet
= ifp
;
831 ipf_pktopts
.ippo_mcast_ttl
= ttl
;
832 ipf_pktopts
.ippo_mcast_loop
= loop
;
837 /* 4135317 - always pass network byte order to filter */
839 #if BYTE_ORDER != BIG_ENDIAN
844 TAILQ_FOREACH(filter
, &ipv4_filters
, ipf_link
) {
846 if ((struct ipfilter
*)inject_filter_ref
== filter
)
848 } else if (filter
->ipf_filter
.ipf_output
) {
850 result
= filter
->ipf_filter
.ipf_output(filter
->ipf_filter
.cookie
, (mbuf_t
*)&m
, ippo
);
851 if (result
== EJUSTRETURN
) {
864 /* set back to host byte order */
865 ip
= mtod(m
, struct ip
*);
867 #if BYTE_ORDER != BIG_ENDIAN
875 ip_mloopback(ifp
, m
, dst
, hlen
);
880 * If we are acting as a multicast router, perform
881 * multicast forwarding as if the packet had just
882 * arrived on the interface to which we are about
883 * to send. The multicast forwarding function
884 * recursively calls this function, using the
885 * IP_FORWARDING flag to prevent infinite recursion.
887 * Multicasts that are looped back by ip_mloopback(),
888 * above, will be forwarded by the ip_input() routine,
891 if (ip_mrouter
&& (flags
& IP_FORWARDING
) == 0) {
893 * Check if rsvp daemon is running. If not, don't
894 * set ip_moptions. This ensures that the packet
895 * is multicast and not just sent down one link
896 * as prescribed by rsvpd.
900 if (ip_mforward(ip
, ifp
, m
, imo
) != 0) {
908 #endif /* MROUTING */
912 * Multicasts with a time-to-live of zero may be looped-
913 * back, above, but must not be transmitted on a network.
914 * Also, multicasts addressed to the loopback interface
915 * are not sent -- the above call to ip_mloopback() will
916 * loop back a copy if this host actually belongs to the
917 * destination group on the loopback interface.
919 if (ip
->ip_ttl
== 0 || ifp
->if_flags
& IFF_LOOPBACK
) {
928 * If source address not specified yet, use address
929 * of outgoing interface.
931 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
932 IFA_LOCK_SPIN(&ia
->ia_ifa
);
933 ip
->ip_src
= IA_SIN(ia
)->sin_addr
;
934 IFA_UNLOCK(&ia
->ia_ifa
);
935 #if IPFIREWALL_FORWARD
936 /* Keep note that we did this - if the firewall changes
937 * the next-hop, our interface may change, changing the
938 * default source IP. It's a shame so much effort happens
942 #endif /* IPFIREWALL_FORWARD */
947 * Look for broadcast address and
948 * verify user is allowed to send
952 if ((ifp
->if_flags
& IFF_BROADCAST
) == 0) {
953 error
= EADDRNOTAVAIL
;
956 if ((flags
& IP_ALLOWBROADCAST
) == 0) {
960 /* don't allow broadcast messages to be fragmented */
961 if ((u_short
)ip
->ip_len
> ifp
->if_mtu
) {
965 m
->m_flags
|= M_BCAST
;
967 m
->m_flags
&= ~M_BCAST
;
972 /* Invoke outbound packet filter */
973 if ( PF_IS_ENABLED
) {
975 rc
= pf_af_hook(ifp
, mppn
, &m
, AF_INET
, FALSE
);
977 if (packetlist
== m0
) {
983 /* Next packet in the chain */
985 } else if (packetlist
!= NULL
) {
986 /* No more packet; send down the chain */
989 /* Nothing left; we're done */
993 ip
= mtod(m
, struct ip
*);
994 pkt_dst
= ip
->ip_dst
;
995 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
999 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1001 if (IN_LINKLOCAL(ntohl(ip
->ip_src
.s_addr
)) || IN_LINKLOCAL(ntohl(ip
->ip_dst
.s_addr
))) {
1002 ip_linklocal_stat
.iplls_out_total
++;
1003 if (ip
->ip_ttl
!= MAXTTL
) {
1004 ip_linklocal_stat
.iplls_out_badttl
++;
1005 ip
->ip_ttl
= MAXTTL
;
1009 if (!didfilter
&& !TAILQ_EMPTY(&ipv4_filters
)) {
1010 struct ipfilter
*filter
;
1011 int seen
= (inject_filter_ref
== 0);
1012 ipf_pktopts
.ippo_flags
&= ~IPPOF_MCAST_OPTS
;
1014 /* Check that a TSO frame isn't passed to a filter.
1015 * This could happen if a filter is inserted while
1016 * TCP is sending the TSO packet.
1018 if (m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
) {
1025 /* 4135317 - always pass network byte order to filter */
1027 #if BYTE_ORDER != BIG_ENDIAN
1032 TAILQ_FOREACH(filter
, &ipv4_filters
, ipf_link
) {
1034 if ((struct ipfilter
*)inject_filter_ref
== filter
)
1036 } else if (filter
->ipf_filter
.ipf_output
) {
1038 result
= filter
->ipf_filter
.ipf_output(filter
->ipf_filter
.cookie
, (mbuf_t
*)&m
, ippo
);
1039 if (result
== EJUSTRETURN
) {
1050 /* set back to host byte order */
1051 ip
= mtod(m
, struct ip
*);
1053 #if BYTE_ORDER != BIG_ENDIAN
1062 /* temporary for testing only: bypass ipsec altogether */
1064 if (ipsec_bypass
!= 0 || (flags
& IP_NOIPSEC
) != 0)
1067 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_START
, 0,0,0,0,0);
1070 /* get SP for this packet */
1072 sp
= ipsec4_getpolicybyaddr(m
, IPSEC_DIR_OUTBOUND
, flags
, &error
);
1074 sp
= ipsec4_getpolicybysock(m
, IPSEC_DIR_OUTBOUND
, so
, &error
);
1077 IPSEC_STAT_INCREMENT(ipsecstat
.out_inval
);
1078 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
, 0,0,0,0,0);
1085 switch (sp
->policy
) {
1086 case IPSEC_POLICY_DISCARD
:
1087 case IPSEC_POLICY_GENERATE
:
1089 * This packet is just discarded.
1091 IPSEC_STAT_INCREMENT(ipsecstat
.out_polvio
);
1092 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
, 1,0,0,0,0);
1095 case IPSEC_POLICY_BYPASS
:
1096 case IPSEC_POLICY_NONE
:
1097 /* no need to do IPsec. */
1098 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
, 2,0,0,0,0);
1101 case IPSEC_POLICY_IPSEC
:
1102 if (sp
->req
== NULL
) {
1103 /* acquire a policy */
1104 error
= key_spdacquire(sp
);
1105 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
, 3,0,0,0,0);
1110 case IPSEC_POLICY_ENTRUST
:
1112 printf("ip_output: Invalid policy found. %d\n", sp
->policy
);
1115 struct ipsec_output_state state
;
1116 bzero(&state
, sizeof(state
));
1118 if (flags
& IP_ROUTETOIF
) {
1119 state
.ro
= &iproute
;
1120 bzero(&iproute
, sizeof(iproute
));
1123 state
.dst
= (struct sockaddr
*)dst
;
1129 * delayed checksums are not currently compatible with IPsec
1131 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
1132 in_delayed_cksum(m
);
1133 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1137 #if BYTE_ORDER != BIG_ENDIAN
1142 DTRACE_IP6(send
, struct mbuf
*, m
, struct inpcb
*, NULL
,
1143 struct ip
*, ip
, struct ifnet
*, ifp
,
1144 struct ip
*, ip
, struct ip6_hdr
*, NULL
);
1146 error
= ipsec4_output(&state
, sp
, flags
);
1150 if (flags
& IP_ROUTETOIF
) {
1152 * if we have tunnel mode SA, we may need to ignore
1155 if (state
.ro
!= &iproute
|| state
.ro
->ro_rt
!= NULL
) {
1156 flags
&= ~IP_ROUTETOIF
;
1162 dst
= (struct sockaddr_in
*)state
.dst
;
1164 /* mbuf is already reclaimed in ipsec4_output. */
1174 printf("ip4_output (ipsec): error code %d\n", error
);
1177 /* don't show these error codes to the user */
1181 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
, 4,0,0,0,0);
1186 /* be sure to update variables that are affected by ipsec4_output() */
1187 ip
= mtod(m
, struct ip
*);
1190 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
1192 hlen
= ip
->ip_hl
<< 2;
1194 /* Check that there wasn't a route change and src is still valid */
1195 if (ro
->ro_rt
!= NULL
&& ro
->ro_rt
->generation_id
!= route_generation
) {
1196 if ((src_ia
= ifa_foraddr(ip
->ip_src
.s_addr
)) == NULL
&&
1197 ((flags
& (IP_ROUTETOIF
| IP_FORWARDING
)) == 0)) {
1198 error
= EADDRNOTAVAIL
;
1199 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1206 IFA_REMREF(&src_ia
->ia_ifa
);
1209 if (ro
->ro_rt
== NULL
) {
1210 if ((flags
& IP_ROUTETOIF
) == 0) {
1211 printf("ip_output: can't update route after "
1212 "IPsec processing\n");
1213 error
= EHOSTUNREACH
; /*XXX*/
1214 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1220 IFA_REMREF(&ia
->ia_ifa
);
1221 RT_LOCK_SPIN(ro
->ro_rt
);
1222 ia
= ifatoia(ro
->ro_rt
->rt_ifa
);
1224 /* Become a regular mutex */
1225 RT_CONVERT_LOCK(ro
->ro_rt
);
1226 IFA_ADDREF(&ia
->ia_ifa
);
1228 ifp
= ro
->ro_rt
->rt_ifp
;
1229 RT_UNLOCK(ro
->ro_rt
);
1232 /* make it flipped, again. */
1234 #if BYTE_ORDER != BIG_ENDIAN
1239 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
, 7,0xff,0xff,0xff,0xff);
1241 /* Pass to filters again */
1242 if (!TAILQ_EMPTY(&ipv4_filters
)) {
1243 struct ipfilter
*filter
;
1245 ipf_pktopts
.ippo_flags
&= ~IPPOF_MCAST_OPTS
;
1247 /* Check that a TSO frame isn't passed to a filter.
1248 * This could happen if a filter is inserted while
1249 * TCP is sending the TSO packet.
1251 if (m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
) {
1258 /* 4135317 - always pass network byte order to filter */
1260 #if BYTE_ORDER != BIG_ENDIAN
1265 TAILQ_FOREACH(filter
, &ipv4_filters
, ipf_link
) {
1266 if (filter
->ipf_filter
.ipf_output
) {
1268 result
= filter
->ipf_filter
.ipf_output(filter
->ipf_filter
.cookie
, (mbuf_t
*)&m
, ippo
);
1269 if (result
== EJUSTRETURN
) {
1280 /* set back to host byte order */
1281 ip
= mtod(m
, struct ip
*);
1283 #if BYTE_ORDER != BIG_ENDIAN
1296 * - Xlate: translate packet's addr/port (NAT).
1297 * - Firewall: deny/allow/etc.
1298 * - Wrap: fake packet's addr/port <unimpl.>
1299 * - Encapsulate: put it in another IP and send out. <unimp.>
1302 struct mbuf
*m1
= m
;
1304 if ((error
= (*fr_checkp
)(ip
, hlen
, ifp
, 1, &m1
)) || !m1
) {
1307 ip
= mtod(m0
= m
= m1
, struct ip
*);
1311 * Check with the firewall...
1312 * but not if we are already being fwd'd from a firewall.
1314 if (fw_enable
&& IPFW_LOADED
&& !args
.next_hop
) {
1315 struct sockaddr_in
*old
= dst
;
1318 args
.next_hop
= dst
;
1320 off
= ip_fw_chk_ptr(&args
);
1322 dst
= args
.next_hop
;
1325 * On return we must do the following:
1326 * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new)
1327 * 1<=off<= 0xffff -> DIVERT
1328 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1329 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
1330 * dst != old -> IPFIREWALL_FORWARD
1331 * off==0, dst==old -> accept
1332 * If some of the above modules are not compiled in, then
1333 * we shouldn't have to check the corresponding condition
1334 * (because the ipfw control socket should not accept
1335 * unsupported rules), but better play safe and drop
1336 * packets in case of doubt.
1339 if ( (off
& IP_FW_PORT_DENY_FLAG
) || m
== NULL
) {
1345 ip
= mtod(m
, struct ip
*);
1347 if (off
== 0 && dst
== old
) {/* common case */
1351 if (DUMMYNET_LOADED
&& (off
& IP_FW_PORT_DYNT_FLAG
) != 0) {
1353 * pass the pkt to dummynet. Need to include
1354 * pipe number, m, ifp, ro, dst because these are
1355 * not recomputed in the next pass.
1356 * All other parameters have been already used and
1357 * so they are not needed anymore.
1358 * XXX note: if the ifp or ro entry are deleted
1359 * while a pkt is in dummynet, we are in trouble!
1364 if (flags
& IP_OUTARGS
)
1367 error
= ip_dn_io_ptr(m
, off
& 0xffff, DN_TO_IP_OUT
,
1371 #endif /* DUMMYNET */
1373 if (off
!= 0 && (off
& IP_FW_PORT_DYNT_FLAG
) == 0) {
1374 struct mbuf
*clone
= NULL
;
1376 /* Clone packet if we're doing a 'tee' */
1377 if ((off
& IP_FW_PORT_TEE_FLAG
) != 0)
1378 clone
= m_dup(m
, M_DONTWAIT
);
1381 * delayed checksums are not currently compatible
1382 * with divert sockets.
1384 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
1385 in_delayed_cksum(m
);
1386 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1389 /* Restore packet header fields to original values */
1391 #if BYTE_ORDER != BIG_ENDIAN
1396 /* Deliver packet to divert input routine */
1397 divert_packet(m
, 0, off
& 0xffff, args
.divert_rule
);
1399 /* If 'tee', continue with original packet */
1400 if (clone
!= NULL
) {
1402 ip
= mtod(m
, struct ip
*);
1409 #if IPFIREWALL_FORWARD
1410 /* Here we check dst to make sure it's directly reachable on the
1411 * interface we previously thought it was.
1412 * If it isn't (which may be likely in some situations) we have
1413 * to re-route it (ie, find a route for the next-hop and the
1414 * associated interface) and set them here. This is nested
1415 * forwarding which in most cases is undesirable, except where
1416 * such control is nigh impossible. So we do it here.
1419 if (off
== 0 && old
!= dst
) {
1420 struct in_ifaddr
*ia_fw
;
1422 /* It's changed... */
1423 /* There must be a better way to do this next line... */
1424 static struct route sro_fwd
, *ro_fwd
= &sro_fwd
;
1425 #if IPFIREWALL_FORWARD_DEBUG
1426 printf("IPFIREWALL_FORWARD: New dst ip: ");
1427 print_ip(dst
->sin_addr
);
1431 * We need to figure out if we have been forwarded
1432 * to a local socket. If so then we should somehow
1433 * "loop back" to ip_input, and get directed to the
1434 * PCB as if we had received this packet. This is
1435 * because it may be difficult to identify the packets
1436 * you want to forward until they are being output
1437 * and have selected an interface. (e.g. locally
1438 * initiated packets) If we used the loopback interface,
1439 * we would not be able to control what happens
1440 * as the packet runs through ip_input() as
1441 * it is done through an ISR.
1443 lck_rw_lock_shared(in_ifaddr_rwlock
);
1444 TAILQ_FOREACH(ia_fw
, &in_ifaddrhead
, ia_link
) {
1446 * If the addr to forward to is one
1447 * of ours, we pretend to
1448 * be the destination for this packet.
1450 IFA_LOCK_SPIN(&ia_fw
->ia_ifa
);
1451 if (IA_SIN(ia_fw
)->sin_addr
.s_addr
==
1452 dst
->sin_addr
.s_addr
) {
1453 IFA_UNLOCK(&ia_fw
->ia_ifa
);
1456 IFA_UNLOCK(&ia_fw
->ia_ifa
);
1458 lck_rw_done(in_ifaddr_rwlock
);
1460 /* tell ip_input "dont filter" */
1461 struct m_tag
*fwd_tag
;
1462 struct ip_fwd_tag
*ipfwd_tag
;
1464 fwd_tag
= m_tag_create(KERNEL_MODULE_TAG_ID
,
1465 KERNEL_TAG_TYPE_IPFORWARD
,
1466 sizeof (*ipfwd_tag
), M_NOWAIT
, m
);
1467 if (fwd_tag
== NULL
) {
1472 ipfwd_tag
= (struct ip_fwd_tag
*)(fwd_tag
+1);
1473 ipfwd_tag
->next_hop
= args
.next_hop
;
1475 m_tag_prepend(m
, fwd_tag
);
1477 if (m
->m_pkthdr
.rcvif
== NULL
)
1478 m
->m_pkthdr
.rcvif
= lo_ifp
;
1479 if ((~IF_HWASSIST_CSUM_FLAGS(m
->m_pkthdr
.rcvif
->if_hwassist
) &
1480 m
->m_pkthdr
.csum_flags
) == 0) {
1481 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
1482 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1483 m
->m_pkthdr
.csum_flags
|=
1484 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
;
1485 m
->m_pkthdr
.csum_data
= 0xffff;
1487 m
->m_pkthdr
.csum_flags
|=
1488 CSUM_IP_CHECKED
| CSUM_IP_VALID
;
1490 else if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
1491 in_delayed_cksum(m
);
1492 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1493 ip
->ip_sum
= in_cksum(m
, hlen
);
1496 #if BYTE_ORDER != BIG_ENDIAN
1501 /* we need to call dlil_output to run filters
1502 * and resync to avoid recursion loops.
1505 dlil_output(lo_ifp
, PF_INET
, m
, 0, (struct sockaddr
*)dst
, 0);
1508 printf("ip_output: no loopback ifp for forwarding!!!\n");
1512 /* Some of the logic for this was
1513 * nicked from above.
1515 * This rewrites the cached route in a local PCB.
1516 * Is this what we want to do?
1518 bcopy(dst
, &ro_fwd
->ro_dst
, sizeof(*dst
));
1520 ro_fwd
->ro_rt
= NULL
;
1521 rtalloc_ign(ro_fwd
, RTF_PRCLONING
);
1523 if (ro_fwd
->ro_rt
== NULL
) {
1524 OSAddAtomic(1, &ipstat
.ips_noroute
);
1525 error
= EHOSTUNREACH
;
1529 RT_LOCK_SPIN(ro_fwd
->ro_rt
);
1530 ia_fw
= ifatoia(ro_fwd
->ro_rt
->rt_ifa
);
1531 if (ia_fw
!= NULL
) {
1532 /* Become a regular mutex */
1533 RT_CONVERT_LOCK(ro_fwd
->ro_rt
);
1534 IFA_ADDREF(&ia_fw
->ia_ifa
);
1536 ifp
= ro_fwd
->ro_rt
->rt_ifp
;
1537 ro_fwd
->ro_rt
->rt_use
++;
1538 if (ro_fwd
->ro_rt
->rt_flags
& RTF_GATEWAY
)
1539 dst
= (struct sockaddr_in
*)ro_fwd
->ro_rt
->rt_gateway
;
1540 if (ro_fwd
->ro_rt
->rt_flags
& RTF_HOST
) {
1542 (ro_fwd
->ro_rt
->rt_flags
& RTF_BROADCAST
);
1544 /* Become a regular mutex */
1545 RT_CONVERT_LOCK(ro_fwd
->ro_rt
);
1546 isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
1548 RT_UNLOCK(ro_fwd
->ro_rt
);
1550 ro
->ro_rt
= ro_fwd
->ro_rt
;
1551 dst
= (struct sockaddr_in
*)&ro_fwd
->ro_dst
;
1554 * If we added a default src ip earlier,
1555 * which would have been gotten from the-then
1556 * interface, do it again, from the new one.
1558 if (ia_fw
!= NULL
) {
1559 if (fwd_rewrite_src
) {
1560 IFA_LOCK_SPIN(&ia_fw
->ia_ifa
);
1561 ip
->ip_src
= IA_SIN(ia_fw
)->sin_addr
;
1562 IFA_UNLOCK(&ia_fw
->ia_ifa
);
1564 IFA_REMREF(&ia_fw
->ia_ifa
);
1568 #endif /* IPFIREWALL_FORWARD */
1570 * if we get here, none of the above matches, and
1571 * we have to drop the pkt
1574 error
= EACCES
; /* not sure this is the right error msg */
1579 #endif /* IPFIREWALL */
1581 /* Do not allow loopback address to wind up on a wire */
1582 if ((ifp
->if_flags
& IFF_LOOPBACK
) == 0 &&
1583 ((ntohl(ip
->ip_src
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
||
1584 (ntohl(ip
->ip_dst
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
)) {
1585 OSAddAtomic(1, &ipstat
.ips_badaddr
);
1588 * Do not simply drop the packet just like a firewall -- we want the
1589 * application to feel the pain.
1590 * Return ENETUNREACH like ip6_output does in some similar cases.
1591 * This can startle the otherwise clueless process that specifies
1592 * loopback as the source address.
1594 error
= ENETUNREACH
;
1598 m
->m_pkthdr
.csum_flags
|= CSUM_IP
;
1599 tso
= (ifp
->if_hwassist
& IFNET_TSO_IPV4
) && (m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
);
1601 sw_csum
= m
->m_pkthdr
.csum_flags
1602 & ~IF_HWASSIST_CSUM_FLAGS(ifp
->if_hwassist
);
1604 if ((ifp
->if_hwassist
& CSUM_TCP_SUM16
) != 0) {
1606 * Special case code for GMACE
1607 * frames that can be checksummed by GMACE SUM16 HW:
1608 * frame >64, no fragments, no UDP
1610 if (apple_hwcksum_tx
&& (m
->m_pkthdr
.csum_flags
& CSUM_TCP
)
1611 && (ip
->ip_len
> 50) && (ip
->ip_len
<= ifp
->if_mtu
)) {
1612 /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */
1613 u_short offset
= (IP_VHL_HL(ip
->ip_vhl
) << 2) +14 ; /* IP+Enet header length */
1614 u_short csumprev
= m
->m_pkthdr
.csum_data
& 0xFFFF;
1615 m
->m_pkthdr
.csum_flags
= CSUM_DATA_VALID
| CSUM_TCP_SUM16
; /* for GMAC */
1616 m
->m_pkthdr
.csum_data
= (csumprev
+ offset
) << 16 ;
1617 m
->m_pkthdr
.csum_data
+= offset
;
1618 sw_csum
= CSUM_DELAY_IP
; /* do IP hdr chksum in software */
1621 /* let the software handle any UDP or TCP checksums */
1622 sw_csum
|= (CSUM_DELAY_DATA
& m
->m_pkthdr
.csum_flags
);
1624 } else if (apple_hwcksum_tx
== 0) {
1625 sw_csum
|= (CSUM_DELAY_DATA
| CSUM_DELAY_IP
) &
1626 m
->m_pkthdr
.csum_flags
;
1629 if (sw_csum
& CSUM_DELAY_DATA
) {
1630 in_delayed_cksum(m
);
1631 sw_csum
&= ~CSUM_DELAY_DATA
;
1632 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1635 if (apple_hwcksum_tx
!= 0) {
1636 m
->m_pkthdr
.csum_flags
&=
1637 IF_HWASSIST_CSUM_FLAGS(ifp
->if_hwassist
);
1639 m
->m_pkthdr
.csum_flags
= 0;
1643 * If small enough for interface, or the interface will take
1644 * care of the fragmentation for us, can just send directly.
1646 if ((u_short
)ip
->ip_len
<= ifp
->if_mtu
|| tso
||
1647 ifp
->if_hwassist
& CSUM_FRAGMENT
) {
1649 m
->m_pkthdr
.csum_flags
|= CSUM_TSO_IPV4
;
1652 #if BYTE_ORDER != BIG_ENDIAN
1658 if (sw_csum
& CSUM_DELAY_IP
) {
1659 ip
->ip_sum
= in_cksum(m
, hlen
);
1663 /* Record statistics for this interface address. */
1664 if (!(flags
& IP_FORWARDING
) && ia
!= NULL
) {
1665 ia
->ia_ifa
.if_opackets
++;
1666 ia
->ia_ifa
.if_obytes
+= m
->m_pkthdr
.len
;
1671 /* clean ipsec history once it goes out of the node */
1672 if (ipsec_bypass
== 0 && (flags
& IP_NOIPSEC
) == 0)
1675 if (packetchain
== 0) {
1676 if (ro
->ro_rt
&& nstat_collect
)
1677 nstat_route_tx(ro
->ro_rt
, 1, m
->m_pkthdr
.len
, 0);
1678 error
= ifnet_output(ifp
, PF_INET
, m
, ro
->ro_rt
,
1679 (struct sockaddr
*)dst
);
1682 else { /* packet chaining allows us to reuse the route for all packets */
1683 bytecnt
+= m
->m_pkthdr
.len
;
1684 mppn
= &m
->m_nextpkt
;
1690 if (pktcnt
> ip_maxchainsent
)
1691 ip_maxchainsent
= pktcnt
;
1692 if (ro
->ro_rt
&& nstat_collect
)
1693 nstat_route_tx(ro
->ro_rt
, pktcnt
, bytecnt
, 0);
1695 error
= ifnet_output(ifp
, PF_INET
, packetlist
,
1696 ro
->ro_rt
, (struct sockaddr
*)dst
);
1708 * Too large for interface; fragment if possible.
1709 * Must be able to put at least 8 bytes per fragment.
1712 if (ip
->ip_off
& IP_DF
|| (m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
) ||
1716 * This case can happen if the user changed the MTU
1717 * of an interface after enabling IP on it. Because
1718 * most netifs don't keep track of routes pointing to
1719 * them, there is no way for one to update all its
1720 * routes when the MTU is changed.
1723 RT_LOCK_SPIN(ro
->ro_rt
);
1724 if ((ro
->ro_rt
->rt_flags
& (RTF_UP
| RTF_HOST
))
1725 && !(ro
->ro_rt
->rt_rmx
.rmx_locks
& RTV_MTU
)
1726 && (ro
->ro_rt
->rt_rmx
.rmx_mtu
> ifp
->if_mtu
)) {
1727 ro
->ro_rt
->rt_rmx
.rmx_mtu
= ifp
->if_mtu
;
1729 RT_UNLOCK(ro
->ro_rt
);
1734 OSAddAtomic(1, &ipstat
.ips_cantfrag
);
1738 error
= ip_fragment(m
, ifp
, ifp
->if_mtu
, sw_csum
);
1744 KERNEL_DEBUG(DBG_LAYER_END
, ip
->ip_dst
.s_addr
,
1745 ip
->ip_src
.s_addr
, ip
->ip_p
, ip
->ip_off
, ip
->ip_len
);
1747 for (m
= m0
; m
; m
= m0
) {
1751 /* clean ipsec history once it goes out of the node */
1752 if (ipsec_bypass
== 0 && (flags
& IP_NOIPSEC
) == 0)
1757 /* Record statistics for this interface address. */
1759 ia
->ia_ifa
.if_opackets
++;
1760 ia
->ia_ifa
.if_obytes
+= m
->m_pkthdr
.len
;
1763 if ((packetchain
!= 0) && (pktcnt
> 0))
1764 panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist
);
1765 if (ro
->ro_rt
&& nstat_collect
)
1766 nstat_route_tx(ro
->ro_rt
, 1, m
->m_pkthdr
.len
, 0);
1767 error
= ifnet_output(ifp
, PF_INET
, m
, ro
->ro_rt
,
1768 (struct sockaddr
*)dst
);
1774 OSAddAtomic(1, &ipstat
.ips_fragmented
);
1778 IFA_REMREF(&ia
->ia_ifa
);
1782 if (ipsec_bypass
== 0 && (flags
& IP_NOIPSEC
) == 0) {
1783 if (ro
== &iproute
&& ro
->ro_rt
) {
1788 KEYDEBUG(KEYDEBUG_IPSEC_STAMP
,
1789 printf("DP ip_output call free SP:%x\n", sp
));
1790 key_freesp(sp
, KEY_SADB_UNLOCKED
);
1795 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT
| DBG_FUNC_END
, error
,0,0,0,0);
/*
 * ip_fragment: split the oversized IP packet in m into fragments that fit
 * within mtu for transmission on ifp.
 *
 * What the visible code does:
 *  - the payload carried per fragment is (mtu - hlen) rounded down to a
 *    multiple of 8;
 *  - when CSUM_DELAY_DATA is pending and ifp lacks CSUM_IP_FRAGS, the
 *    transport checksum is finished with in_delayed_cksum() first;
 *  - each later fragment gets a new header mbuf (MGETHDR) whose IP header
 *    is built at mhip (options come from ip_optcopy() when hlen exceeds
 *    sizeof (struct ip)) and whose data is attached through m_copy() from
 *    the original chain m0;
 *  - the original mbuf is finally trimmed with m_adj() to become the first
 *    fragment and is marked IP_MF;
 *  - IP header checksums are computed here only when sw_csum has
 *    CSUM_DELAY_IP set.
 *
 * NOTE(review): this listing elides source lines (return type, braces,
 * error paths, the m0/nfrags setup and the return statement), so the error
 * contract and chain ownership cannot be confirmed from here. Fragments
 * are presumably linked through m_nextpkt via mnext -- verify.
 */
1803 ip_fragment(struct mbuf
*m
, struct ifnet
*ifp
, unsigned long mtu
, int sw_csum
)
1805 struct ip
*ip
, *mhip
;
1806 int len
, hlen
, mhlen
, firstlen
, off
, error
= 0;
1807 struct mbuf
**mnext
= &m
->m_nextpkt
, *m0
;
1810 ip
= mtod(m
, struct ip
*);
1812 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
1814 hlen
= ip
->ip_hl
<< 2;
1817 firstlen
= len
= (mtu
- hlen
) &~ 7;
1824 * if the interface will not calculate checksums on
1825 * fragmented packets, then do it here.
1827 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
&&
1828 (ifp
->if_hwassist
& CSUM_IP_FRAGS
) == 0) {
1829 in_delayed_cksum(m
);
1830 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_DATA
;
1834 * Loop through length of segment after first fragment,
1835 * make new header and copy data of each part and link onto chain.
1838 mhlen
= sizeof (struct ip
);
1839 for (off
= hlen
+ len
; off
< (u_short
)ip
->ip_len
; off
+= len
) {
1840 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1843 OSAddAtomic(1, &ipstat
.ips_odropped
);
1846 m
->m_flags
|= (m0
->m_flags
& M_MCAST
) | M_FRAG
;
1847 m
->m_data
+= max_linkhdr
;
1848 mhip
= mtod(m
, struct ip
*);
1850 if (hlen
> sizeof (struct ip
)) {
1851 mhlen
= ip_optcopy(ip
, mhip
) + sizeof (struct ip
);
1852 mhip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, mhlen
>> 2);
1855 mhip
->ip_off
= ((off
- hlen
) >> 3) + (ip
->ip_off
& ~IP_MF
);
1856 if (ip
->ip_off
& IP_MF
)
1857 mhip
->ip_off
|= IP_MF
;
1858 if (off
+ len
>= (u_short
)ip
->ip_len
)
1859 len
= (u_short
)ip
->ip_len
- off
;
1861 mhip
->ip_off
|= IP_MF
;
1862 mhip
->ip_len
= htons((u_short
)(len
+ mhlen
));
1863 m
->m_next
= m_copy(m0
, off
, len
);
1864 if (m
->m_next
== 0) {
1866 error
= ENOBUFS
; /* ??? */
1867 OSAddAtomic(1, &ipstat
.ips_odropped
);
1870 m
->m_pkthdr
.len
= mhlen
+ len
;
1871 m
->m_pkthdr
.rcvif
= 0;
1872 m
->m_pkthdr
.csum_flags
= m0
->m_pkthdr
.csum_flags
;
1873 m
->m_pkthdr
.socket_id
= m0
->m_pkthdr
.socket_id
;
1875 mac_netinet_fragment(m0
, m
);
1878 #if BYTE_ORDER != BIG_ENDIAN
1879 HTONS(mhip
->ip_off
);
1883 if (sw_csum
& CSUM_DELAY_IP
) {
1884 mhip
->ip_sum
= in_cksum(m
, mhlen
);
1887 mnext
= &m
->m_nextpkt
;
1890 OSAddAtomic(nfrags
, &ipstat
.ips_ofragments
);
1892 /* set first/last markers for fragment chain */
1893 m
->m_flags
|= M_LASTFRAG
;
1894 m0
->m_flags
|= M_FIRSTFRAG
| M_FRAG
;
1895 m0
->m_pkthdr
.csum_data
= nfrags
;
1898 * Update first fragment by trimming what's been copied out
1899 * and updating header, then send each fragment (in order).
1902 m_adj(m
, hlen
+ firstlen
- (u_short
)ip
->ip_len
);
1903 m
->m_pkthdr
.len
= hlen
+ firstlen
;
1904 ip
->ip_len
= htons((u_short
)m
->m_pkthdr
.len
);
1905 ip
->ip_off
|= IP_MF
;
1907 #if BYTE_ORDER != BIG_ENDIAN
1912 if (sw_csum
& CSUM_DELAY_IP
) {
1913 ip
->ip_sum
= in_cksum(m
, hlen
);
/*
 * ip_out_cksum_stats: account len bytes of software-checksummed output for
 * protocol proto by calling tcp_out_cksum_stats() or udp_out_cksum_stats().
 *
 * NOTE(review): the dispatch on proto is elided in this listing; going by
 * the comment at the end of the body, protocols other than TCP and UDP are
 * presumably not counted -- verify against the full source.
 */
1923 ip_out_cksum_stats(int proto
, u_int32_t len
)
1927 tcp_out_cksum_stats(len
);
1930 udp_out_cksum_stats(len
);
1933 /* keep only TCP or UDP stats for now */
/*
 * in_delayed_cksum_offset: finish a deferred transport-layer checksum for
 * the IP packet that begins ip_offset bytes into the mbuf chain m0.
 *
 * Visible steps:
 *  - advance to the mbuf that contains ip_offset;
 *  - read the IP header in place, or through m_copydata() into a local
 *    buffer when it is not contiguous in that mbuf;
 *  - temporarily hide the leading ip_offset bytes by adjusting
 *    m_len/m_data;
 *  - pick the byte order of ip_len by matching it (and, failing that, its
 *    SWAP16 form) against m0->m_pkthdr.len - ip_offset;
 *  - compute the sum with in_cksum_skip() and report the length to
 *    ip_out_cksum_stats();
 *  - restore m_len/m_data, then store the result at header length plus the
 *    low 16 bits of m_pkthdr.csum_data, either directly or through
 *    m_copyback() when that position lies outside the first mbuf.
 *
 * NOTE(review): the return type, braces and early-exit paths are elided in
 * this listing, so the behaviour on a bogus packet cannot be confirmed
 * from here.
 */
1939 in_delayed_cksum_offset(struct mbuf
*m0
, int ip_offset
)
1942 unsigned char buf
[sizeof(struct ip
)];
1943 u_short csum
, offset
, ip_len
;
1945 /* Save copy of first mbuf pointer and the ip_offset before modifying */
1946 struct mbuf
*m
= m0
;
1947 int ip_offset_copy
= ip_offset
;
1949 while (ip_offset
>= m
->m_len
) {
1950 ip_offset
-= m
->m_len
;
1953 printf("in_delayed_cksum_withoffset failed - ip_offset wasn't in the packet\n");
1958 /* Sometimes the IP header is not contiguous, yes this can happen! */
1959 if (ip_offset
+ sizeof(struct ip
) > m
->m_len
) {
1961 printf("delayed m_pullup, m->len: %d off: %d\n",
1962 m
->m_len
, ip_offset
);
1964 m_copydata(m
, ip_offset
, sizeof(struct ip
), (caddr_t
) buf
);
1966 ip
= (struct ip
*)buf
;
1968 ip
= (struct ip
*)(m
->m_data
+ ip_offset
);
1973 m
->m_len
-= ip_offset
;
1974 m
->m_data
+= ip_offset
;
1977 offset
= IP_VHL_HL(ip
->ip_vhl
) << 2 ;
1980 * We could be in the context of an IP or interface filter; in the
1981 * former case, ip_len would be in host (correct) order while for
1982 * the latter it would be in network order. Because of this, we
1983 * attempt to interpret the length field by comparing it against
1984 * the actual packet length. If the comparison fails, byte swap
1985 * the length and check again. If it still fails, then the packet
1986 * is bogus and we give up.
1988 ip_len
= ip
->ip_len
;
1989 if (ip_len
!= (m0
->m_pkthdr
.len
- ip_offset_copy
)) {
1990 ip_len
= SWAP16(ip_len
);
1991 if (ip_len
!= (m0
->m_pkthdr
.len
- ip_offset_copy
)) {
1992 printf("in_delayed_cksum_offset: ip_len %d (%d) "
1993 "doesn't match actual length %d\n", ip
->ip_len
,
1994 ip_len
, (m0
->m_pkthdr
.len
- ip_offset_copy
));
1999 csum
= in_cksum_skip(m
, ip_len
, offset
);
2002 ip_out_cksum_stats(ip
->ip_p
, ip_len
- offset
);
2004 if (m0
->m_pkthdr
.csum_flags
& CSUM_UDP
&& csum
== 0)
2006 offset
+= m0
->m_pkthdr
.csum_data
& 0xFFFF; /* checksum offset */
2010 if (M_LEADINGSPACE(m
) < ip_offset
)
2011 panic("in_delayed_cksum_offset - chain modified!\n");
2012 m
->m_len
+= ip_offset
;
2013 m
->m_data
-= ip_offset
;
2016 if (offset
> ip_len
) /* bogus offset */
2019 /* Insert the checksum in the existing chain */
2020 if (offset
+ ip_offset
+ sizeof(u_short
) > m
->m_len
) {
2024 printf("delayed m_copyback, m->len: %d off: %d p: %d\n",
2025 m
->m_len
, offset
+ ip_offset
, ip
->ip_p
);
2027 *(u_short
*)tmp
= csum
;
2028 m_copyback(m
, offset
+ ip_offset
, 2, tmp
);
2030 *(u_short
*)(m
->m_data
+ offset
+ ip_offset
) = csum
;
/*
 * in_delayed_cksum: finish the deferred transport checksum of packet m.
 * Thin wrapper that calls in_delayed_cksum_offset() with an offset of 0,
 * i.e. the IP header is taken to start at the beginning of the chain.
 */
2034 in_delayed_cksum(struct mbuf
*m
)
2036 in_delayed_cksum_offset(m
, 0);
/*
 * in_cksum_offset: compute the IP header checksum for the header located
 * ip_offset bytes into the mbuf chain m and store it in ip_sum.
 *
 * Visible steps:
 *  - advance to the mbuf that contains ip_offset;
 *  - use the header in place, or copy it with m_copydata() into a local
 *    buffer when it is not contiguous (an m_copyback() of the ip_sum field
 *    is issued on that path);
 *  - temporarily hide the leading ip_offset bytes by adjusting
 *    m_len/m_data;
 *  - check ntohs(ip_len) against m0->m_pkthdr.len - ip_offset, byte
 *    swapping ip_len with SWAP16 when the first comparison fails;
 *  - set ip_sum from in_cksum(m, hlen);
 *  - restore m_len/m_data and, when the header is not contiguous, write
 *    ip_sum back into the chain with m_copyback().
 *
 * NOTE(review): the return type, braces, the conditions guarding the
 * SWAP16 restore and the give-up paths are elided in this listing; confirm
 * them against the full source before relying on this summary.
 */
2040 in_cksum_offset(struct mbuf
* m
, size_t ip_offset
)
2042 struct ip
* ip
= NULL
;
2044 unsigned char buf
[sizeof(struct ip
)];
2047 /* Save copy of first mbuf pointer and the ip_offset before modifying */
2048 struct mbuf
* m0
= m
;
2049 size_t ip_offset_copy
= ip_offset
;
2051 while (ip_offset
>= m
->m_len
) {
2052 ip_offset
-= m
->m_len
;
2055 printf("in_cksum_offset failed - ip_offset wasn't in the packet\n");
2060 /* Sometimes the IP header is not contiguous, yes this can happen! */
2061 if (ip_offset
+ sizeof(struct ip
) > m
->m_len
) {
2064 printf("in_cksum_offset - delayed m_pullup, m->len: %d off: %lu\n",
2065 m
->m_len
, ip_offset
);
2067 m_copydata(m
, ip_offset
, sizeof(struct ip
), (caddr_t
) buf
);
2069 ip
= (struct ip
*)buf
;
2071 m_copyback(m
, ip_offset
+ offsetof(struct ip
, ip_sum
), 2, (caddr_t
)&ip
->ip_sum
);
2073 ip
= (struct ip
*)(m
->m_data
+ ip_offset
);
2079 m
->m_len
-= ip_offset
;
2080 m
->m_data
+= ip_offset
;
2084 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
2086 hlen
= ip
->ip_hl
<< 2;
2089 * We could be in the context of an IP or interface filter; in the
2090 * former case, ip_len would be in host order while for the latter
2091 * it would be in network (correct) order. Because of this, we
2092 * attempt to interpret the length field by comparing it against
2093 * the actual packet length. If the comparison fails, byte swap
2094 * the length and check again. If it still fails, then the packet
2095 * is bogus and we give up.
2097 if (ntohs(ip
->ip_len
) != (m0
->m_pkthdr
.len
- ip_offset_copy
)) {
2098 ip
->ip_len
= SWAP16(ip
->ip_len
);
2100 if (ntohs(ip
->ip_len
) != (m0
->m_pkthdr
.len
- ip_offset_copy
)) {
2101 ip
->ip_len
= SWAP16(ip
->ip_len
);
2102 printf("in_cksum_offset: ip_len %d (%d) "
2103 "doesn't match actual length %lu\n",
2104 ip
->ip_len
, SWAP16(ip
->ip_len
),
2105 (m0
->m_pkthdr
.len
- ip_offset_copy
));
2111 ip
->ip_sum
= in_cksum(m
, hlen
);
2113 ip
->ip_len
= SWAP16(ip
->ip_len
);
2117 if (M_LEADINGSPACE(m
) < ip_offset
)
2118 panic("in_cksum_offset - chain modified!\n");
2119 m
->m_len
+= ip_offset
;
2120 m
->m_data
-= ip_offset
;
2123 /* Insert the checksum in the existing chain if IP header not contiguous */
2124 if (ip_offset
+ sizeof(struct ip
) > m
->m_len
) {
2128 printf("in_cksum_offset m_copyback, m->len: %u off: %lu p: %d\n",
2129 m
->m_len
, ip_offset
+ offsetof(struct ip
, ip_sum
), ip
->ip_p
);
2131 *(u_short
*)tmp
= ip
->ip_sum
;
2132 m_copyback(m
, ip_offset
+ offsetof(struct ip
, ip_sum
), 2, tmp
);
2137 * Insert IP options into preformed packet.
2138 * Adjust IP destination as required for IP source routing,
2139 * as indicated by a non-zero in_addr at the start of the options.
2141 * XXX This routine assumes that the packet has no options in place.
/*
 * ip_insertoptions: place the IP options held in opt directly after the IP
 * header of packet m and hand back the resulting mbuf.
 *
 * Visible behaviour:
 *  - optlen is opt->m_len minus the size of the leading ipopt_dst field;
 *  - if optlen plus ip_len would exceed IP_MAXPACKET, m is returned
 *    unchanged (the original marks this "XXX should fail");
 *  - a non-zero ipopt_dst replaces ip_dst;
 *  - when m uses external storage (M_EXT) or has no room in front of
 *    m_data, a new header mbuf is obtained with MGETHDR and the IP header
 *    is copied into it; otherwise m_data is moved back by optlen and the
 *    header is slid down with ovbcopy();
 *  - the option bytes are copied in after the header, *phlen is set to
 *    sizeof(struct ip) + optlen, and ip_vhl/ip_len are updated to match.
 *
 * NOTE(review): several lines (parameter declarations, allocation failure
 * handling, the final return) are elided in this listing; the handling of
 * a failed MGETHDR cannot be confirmed from here.
 */
2143 static struct mbuf
*
2144 ip_insertoptions(m
, opt
, phlen
)
2145 register struct mbuf
*m
;
2149 register struct ipoption
*p
= mtod(opt
, struct ipoption
*);
2151 register struct ip
*ip
= mtod(m
, struct ip
*);
2154 optlen
= opt
->m_len
- sizeof(p
->ipopt_dst
);
2155 if (optlen
+ (u_short
)ip
->ip_len
> IP_MAXPACKET
)
2156 return (m
); /* XXX should fail */
2157 if (p
->ipopt_dst
.s_addr
)
2158 ip
->ip_dst
= p
->ipopt_dst
;
2159 if (m
->m_flags
& M_EXT
|| m
->m_data
- optlen
< m
->m_pktdat
) {
2160 MGETHDR(n
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
2163 n
->m_pkthdr
.rcvif
= 0;
2165 mac_mbuf_label_copy(m
, n
);
2167 n
->m_pkthdr
.len
= m
->m_pkthdr
.len
+ optlen
;
2168 m
->m_len
-= sizeof(struct ip
);
2169 m
->m_data
+= sizeof(struct ip
);
2172 m
->m_len
= optlen
+ sizeof(struct ip
);
2173 m
->m_data
+= max_linkhdr
;
2174 (void)memcpy(mtod(m
, void *), ip
, sizeof(struct ip
));
2176 m
->m_data
-= optlen
;
2178 m
->m_pkthdr
.len
+= optlen
;
2179 ovbcopy((caddr_t
)ip
, mtod(m
, caddr_t
), sizeof(struct ip
));
2181 ip
= mtod(m
, struct ip
*);
2182 bcopy(p
->ipopt_list
, ip
+ 1, optlen
);
2183 *phlen
= sizeof(struct ip
) + optlen
;
2184 ip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, *phlen
>> 2);
2185 ip
->ip_len
+= optlen
;
2190 * Copy options from ip to jp,
2191 * omitting those not copied during fragmentation.
2197 register u_char
*cp
, *dp
;
2198 int opt
, optlen
, cnt
;
2200 cp
= (u_char
*)(ip
+ 1);
2201 dp
= (u_char
*)(jp
+ 1);
2202 cnt
= (IP_VHL_HL(ip
->ip_vhl
) << 2) - sizeof (struct ip
);
2203 for (; cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
2205 if (opt
== IPOPT_EOL
)
2207 if (opt
== IPOPT_NOP
) {
2208 /* Preserve for IP mcast tunnel's LSRR alignment. */
2214 if (cnt
< IPOPT_OLEN
+ sizeof(*cp
))
2215 panic("malformed IPv4 option passed to ip_optcopy");
2217 optlen
= cp
[IPOPT_OLEN
];
2219 if (optlen
< IPOPT_OLEN
+ sizeof(*cp
) || optlen
> cnt
)
2220 panic("malformed IPv4 option passed to ip_optcopy");
2222 /* bogus lengths should have been caught by ip_dooptions */
2225 if (IPOPT_COPIED(opt
)) {
2226 bcopy(cp
, dp
, optlen
);
2230 for (optlen
= dp
- (u_char
*)(jp
+1); optlen
& 0x3; optlen
++)
2236 * IP socket option processing.
/*
 * ip_ctloutput: get/set handler for IP-level socket options on socket so,
 * with the request described by sopt.
 *
 * Visible structure: a check of sopt->sopt_level against IPPROTO_IP, then
 * a switch on sopt->sopt_dir and, inside each direction, a switch on
 * sopt->sopt_name.
 *
 * Set direction, as far as the listing shows:
 *  - raw IP options are copied into an mbuf (bounded by MLEN) and passed
 *    to ip_pcbopts();
 *  - integer options are read with sooptcopyin() into optval and stored in
 *    inp_ip_tos / inp_ip_ttl or toggled in inp_flags through OPTSET;
 *  - IP_FORCE_OUT_IFP copies an interface name, resolves it with
 *    ifnet_find_by_name() and binds through inp_bindif(); an empty name
 *    selects IFSCOPE_NONE;
 *  - multicast options are forwarded to inp_setmoptions();
 *  - the port range cases update INP_LOWPORT / INP_HIGHPORT;
 *  - IP_IPSEC_POLICY goes through ipsec4_set_policy();
 *  - IP_TRAFFIC_MGT_BACKGROUND sets or clears TRAFFIC_MGT_SO_BACKGROUND;
 *  - the bound-interface and IP_NO_IFT_CELLULAR cases require INP_IPV4
 *    and call inp_bindif() / inp_nocellular().
 *
 * Get direction: values are returned with sooptcopyout(); multicast
 * options go to inp_getmoptions() and IP_IPSEC_POLICY to
 * ipsec4_get_policy().
 *
 * ENOPROTOOPT is assigned in both directions, presumably for option names
 * that are not handled -- the default labels themselves are elided.
 *
 * NOTE(review): most case labels, braces, error assignments and the final
 * return are missing from this listing; treat the summary above as a guide
 * to the visible calls only and confirm against the full source.
 */
2239 ip_ctloutput(so
, sopt
)
2241 struct sockopt
*sopt
;
2243 struct inpcb
*inp
= sotoinpcb(so
);
2247 if (sopt
->sopt_level
!= IPPROTO_IP
) {
2251 switch (sopt
->sopt_dir
) {
2253 switch (sopt
->sopt_name
) {
2260 if (sopt
->sopt_valsize
> MLEN
) {
2264 MGET(m
, sopt
->sopt_p
!= kernproc
? M_WAIT
: M_DONTWAIT
,
2270 m
->m_len
= sopt
->sopt_valsize
;
2271 error
= sooptcopyin(sopt
, mtod(m
, char *), m
->m_len
,
2276 return (ip_pcbopts(sopt
->sopt_name
, &inp
->inp_options
,
2283 case IP_RECVRETOPTS
:
2284 case IP_RECVDSTADDR
:
2287 #if defined(NFAITH) && NFAITH > 0
2290 case IP_RECVPKTINFO
:
2291 error
= sooptcopyin(sopt
, &optval
, sizeof optval
,
2296 switch (sopt
->sopt_name
) {
2298 inp
->inp_ip_tos
= optval
;
2302 inp
->inp_ip_ttl
= optval
;
2304 #define OPTSET(bit) \
2306 inp->inp_flags |= bit; \
2308 inp->inp_flags &= ~bit;
2311 OPTSET(INP_RECVOPTS
);
2314 case IP_RECVRETOPTS
:
2315 OPTSET(INP_RECVRETOPTS
);
2318 case IP_RECVDSTADDR
:
2319 OPTSET(INP_RECVDSTADDR
);
2327 OPTSET(INP_RECVTTL
);
2330 #if defined(NFAITH) && NFAITH > 0
2335 case IP_RECVPKTINFO
:
2336 OPTSET(INP_PKTINFO
);
2342 #if CONFIG_FORCE_OUT_IFP
2344 * Apple private interface, similar to IP_BOUND_IF, except
2345 * that the parameter is a NULL-terminated string containing
2346 * the name of the network interface; an emptry string means
2347 * unbind. Applications are encouraged to use IP_BOUND_IF
2348 * instead, as that is the current "official" API.
2350 case IP_FORCE_OUT_IFP
: {
2351 char ifname
[IFNAMSIZ
];
2352 unsigned int ifscope
;
2354 /* This option is settable only for IPv4 */
2355 if (!(inp
->inp_vflag
& INP_IPV4
)) {
2360 /* Verify interface name parameter is sane */
2361 if (sopt
->sopt_valsize
> sizeof(ifname
)) {
2366 /* Copy the interface name */
2367 if (sopt
->sopt_valsize
!= 0) {
2368 error
= sooptcopyin(sopt
, ifname
,
2369 sizeof (ifname
), sopt
->sopt_valsize
);
2374 if (sopt
->sopt_valsize
== 0 || ifname
[0] == '\0') {
2375 /* Unbind this socket from any interface */
2376 ifscope
= IFSCOPE_NONE
;
2380 /* Verify name is NULL terminated */
2381 if (ifname
[sopt
->sopt_valsize
- 1] != '\0') {
2386 /* Bail out if given bogus interface name */
2387 if (ifnet_find_by_name(ifname
, &ifp
) != 0) {
2392 /* Bind this socket to this interface */
2393 ifscope
= ifp
->if_index
;
2396 * Won't actually free; since we don't release
2397 * this later, we should do it now.
2401 inp_bindif(inp
, ifscope
);
2406 * Multicast socket options are processed by the in_mcast
2409 case IP_MULTICAST_IF
:
2410 case IP_MULTICAST_IFINDEX
:
2411 case IP_MULTICAST_VIF
:
2412 case IP_MULTICAST_TTL
:
2413 case IP_MULTICAST_LOOP
:
2414 case IP_ADD_MEMBERSHIP
:
2415 case IP_DROP_MEMBERSHIP
:
2416 case IP_ADD_SOURCE_MEMBERSHIP
:
2417 case IP_DROP_SOURCE_MEMBERSHIP
:
2418 case IP_BLOCK_SOURCE
:
2419 case IP_UNBLOCK_SOURCE
:
2421 case MCAST_JOIN_GROUP
:
2422 case MCAST_LEAVE_GROUP
:
2423 case MCAST_JOIN_SOURCE_GROUP
:
2424 case MCAST_LEAVE_SOURCE_GROUP
:
2425 case MCAST_BLOCK_SOURCE
:
2426 case MCAST_UNBLOCK_SOURCE
:
2427 error
= inp_setmoptions(inp
, sopt
);
2431 error
= sooptcopyin(sopt
, &optval
, sizeof optval
,
2437 case IP_PORTRANGE_DEFAULT
:
2438 inp
->inp_flags
&= ~(INP_LOWPORT
);
2439 inp
->inp_flags
&= ~(INP_HIGHPORT
);
2442 case IP_PORTRANGE_HIGH
:
2443 inp
->inp_flags
&= ~(INP_LOWPORT
);
2444 inp
->inp_flags
|= INP_HIGHPORT
;
2447 case IP_PORTRANGE_LOW
:
2448 inp
->inp_flags
&= ~(INP_HIGHPORT
);
2449 inp
->inp_flags
|= INP_LOWPORT
;
2459 case IP_IPSEC_POLICY
:
2467 if ((error
= soopt_getm(sopt
, &m
)) != 0) /* XXX */
2469 if ((error
= soopt_mcopyin(sopt
, m
)) != 0) /* XXX */
2471 priv
= (proc_suser(sopt
->sopt_p
) == 0);
2473 req
= mtod(m
, caddr_t
);
2476 optname
= sopt
->sopt_name
;
2477 error
= ipsec4_set_policy(inp
, optname
, req
, len
, priv
);
2484 case IP_TRAFFIC_MGT_BACKGROUND
:
2486 unsigned background
= 0;
2487 error
= sooptcopyin(sopt
, &background
, sizeof(background
), sizeof(background
));
2492 socket_set_traffic_mgt_flags_locked(so
,
2493 TRAFFIC_MGT_SO_BACKGROUND
);
2495 socket_clear_traffic_mgt_flags_locked(so
,
2496 TRAFFIC_MGT_SO_BACKGROUND
);
2501 #endif /* TRAFFIC_MGT */
2504 * On a multihomed system, scoped routing can be used to
2505 * restrict the source interface used for sending packets.
2506 * The socket option IP_BOUND_IF binds a particular AF_INET
2507 * socket to an interface such that data sent on the socket
2508 * is restricted to that interface. This is unlike the
2509 * SO_DONTROUTE option where the routing table is bypassed;
2510 * therefore it allows for a greater flexibility and control
2511 * over the system behavior, and does not place any restriction
2512 * on the destination address type (e.g. unicast, multicast,
2513 * or broadcast if applicable) or whether or not the host is
2514 * directly reachable. Note that in the multicast transmit
2515 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2516 * IP_BOUND_IF, since the former practically bypasses the
2517 * routing table; in this case, IP_BOUND_IF sets the default
2518 * interface used for sending multicast packets in the absence
2519 * of an explicit multicast transmit interface.
2522 /* This option is settable only for IPv4 */
2523 if (!(inp
->inp_vflag
& INP_IPV4
)) {
2528 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
2534 inp_bindif(inp
, optval
);
2537 case IP_NO_IFT_CELLULAR
:
2538 /* This option is settable only for IPv4 */
2539 if (!(inp
->inp_vflag
& INP_IPV4
)) {
2544 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
2550 error
= inp_nocellular(inp
, optval
);
2554 /* This option is not settable */
2559 error
= ENOPROTOOPT
;
2565 switch (sopt
->sopt_name
) {
2568 if (inp
->inp_options
)
2569 error
= sooptcopyout(sopt
,
2570 mtod(inp
->inp_options
,
2572 inp
->inp_options
->m_len
);
2574 sopt
->sopt_valsize
= 0;
2580 case IP_RECVRETOPTS
:
2581 case IP_RECVDSTADDR
:
2585 #if defined(NFAITH) && NFAITH > 0
2588 case IP_RECVPKTINFO
:
2589 switch (sopt
->sopt_name
) {
2592 optval
= inp
->inp_ip_tos
;
2596 optval
= inp
->inp_ip_ttl
;
2599 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
2602 optval
= OPTBIT(INP_RECVOPTS
);
2605 case IP_RECVRETOPTS
:
2606 optval
= OPTBIT(INP_RECVRETOPTS
);
2609 case IP_RECVDSTADDR
:
2610 optval
= OPTBIT(INP_RECVDSTADDR
);
2614 optval
= OPTBIT(INP_RECVIF
);
2618 optval
= OPTBIT(INP_RECVTTL
);
2622 if (inp
->inp_flags
& INP_HIGHPORT
)
2623 optval
= IP_PORTRANGE_HIGH
;
2624 else if (inp
->inp_flags
& INP_LOWPORT
)
2625 optval
= IP_PORTRANGE_LOW
;
2630 #if defined(NFAITH) && NFAITH > 0
2632 optval
= OPTBIT(INP_FAITH
);
2635 case IP_RECVPKTINFO
:
2636 optval
= OPTBIT(INP_PKTINFO
);
2639 error
= sooptcopyout(sopt
, &optval
, sizeof optval
);
2642 case IP_MULTICAST_IF
:
2643 case IP_MULTICAST_IFINDEX
:
2644 case IP_MULTICAST_VIF
:
2645 case IP_MULTICAST_TTL
:
2646 case IP_MULTICAST_LOOP
:
2648 error
= inp_getmoptions(inp
, sopt
);
2652 case IP_IPSEC_POLICY
:
2654 struct mbuf
*m
= NULL
;
2659 req
= mtod(m
, caddr_t
);
2662 error
= ipsec4_get_policy(sotoinpcb(so
), req
, len
, &m
);
2664 error
= soopt_mcopyout(sopt
, m
); /* XXX */
2672 case IP_TRAFFIC_MGT_BACKGROUND
:
2674 unsigned background
= (so
->so_traffic_mgt_flags
& TRAFFIC_MGT_SO_BACKGROUND
);
2675 return (sooptcopyout(sopt
, &background
, sizeof(background
)));
2678 #endif /* TRAFFIC_MGT */
2681 if (inp
->inp_flags
& INP_BOUND_IF
)
2682 optval
= inp
->inp_boundif
;
2683 error
= sooptcopyout(sopt
, &optval
, sizeof (optval
));
2686 case IP_NO_IFT_CELLULAR
:
2687 optval
= (inp
->inp_flags
& INP_NO_IFT_CELLULAR
) ? 1 : 0;
2688 error
= sooptcopyout(sopt
, &optval
, sizeof (optval
));
2692 optval
= inp
->inp_last_outif
;
2693 error
= sooptcopyout(sopt
, &optval
, sizeof (optval
));
2697 error
= ENOPROTOOPT
;
2706 * Set up IP options in pcb for insertion in output packets.
2707 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2708 * with destination address if source routed.
2712 __unused
int optname
,
2713 struct mbuf
**pcbopt
,
2714 register struct mbuf
*m
)
2716 register int cnt
, optlen
;
2717 register u_char
*cp
;
/*
 * Overview, based only on the lines visible in this excerpt.
 * NOTE(review): the excerpt omits several lines of this routine
 * (braces, case labels and the statement that follows each check),
 * so the outcome of each individual check is not described here.
 *
 *  - optname is marked __unused.
 *  - Any options already attached through *pcbopt are released with
 *    m_free().
 *  - A null m, or one with m_len == 0, means the caller only wanted
 *    the previous options turned off.
 *  - m->m_len is tested for being a multiple of sizeof(int32_t).
 *  - The end of the data plus one struct in_addr is compared against
 *    the end of m_dat[MLEN] before the data is grown.
 *  - The option bytes are shifted by sizeof(struct in_addr) so that a
 *    zeroed first-hop address slot sits in front of them.
 *  - The options are then walked one at a time with length checks;
 *    for a source-routed option the first hop is copied into the
 *    leading slot and removed from the option itself.
 *  - Finally m->m_len is compared with
 *    MAX_IPOPTLEN + sizeof(struct in_addr).
 */
2720 /* turn off any old options */
2722 (void)m_free(*pcbopt
);
2724 if (m
== (struct mbuf
*)0 || m
->m_len
== 0) {
2726 * Only turning off any previous options.
2734 if (m
->m_len
% sizeof(int32_t))
2738 * IP first-hop destination address will be stored before
2739 * actual options; move other options back
2740 * and clear it when none present.
2742 if (m
->m_data
+ m
->m_len
+ sizeof(struct in_addr
) >= &m
->m_dat
[MLEN
])
/*
 * Grow the mbuf by one address, copy cnt bytes of options from the
 * start of the data to just past the new slot (ovbcopy, since the two
 * ranges overlap), then zero the slot.
 * NOTE(review): the assignment that gives cnt its initial value is not
 * visible in this excerpt.
 */
2745 m
->m_len
+= sizeof(struct in_addr
);
2746 cp
= mtod(m
, u_char
*) + sizeof(struct in_addr
);
2747 ovbcopy(mtod(m
, caddr_t
), (caddr_t
)cp
, (unsigned)cnt
);
2748 bzero(mtod(m
, caddr_t
), sizeof(struct in_addr
));
/*
 * Walk the options: cp points at the current option, cnt is the number
 * of bytes left, and optlen (the length of the current option) is the
 * step applied to both on every iteration.
 */
2750 for (; cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
2751 opt
= cp
[IPOPT_OPTVAL
];
2752 if (opt
== IPOPT_EOL
)
2754 if (opt
== IPOPT_NOP
)
2757 if (cnt
< IPOPT_OLEN
+ sizeof(*cp
))
2759 optlen
= cp
[IPOPT_OLEN
];
2760 if (optlen
< IPOPT_OLEN
+ sizeof(*cp
) || optlen
> cnt
)
2771 * user process specifies route as:
2773 * D must be our final destination (but we can't
2774 * check that since we may not have connected yet).
2775 * A is first hop destination, which doesn't appear in
2776 * actual IP option, but is stored before the options.
2778 if (optlen
< IPOPT_MINOFF
- 1 + sizeof(struct in_addr
))
/*
 * One address is about to be taken out of the option: shrink the mbuf
 * length, the remaining byte count and the option length by
 * sizeof(struct in_addr), and write the new length back into the
 * option header.
 */
2780 m
->m_len
-= sizeof(struct in_addr
);
2781 cnt
-= sizeof(struct in_addr
);
2782 optlen
-= sizeof(struct in_addr
);
2783 cp
[IPOPT_OLEN
] = optlen
;
2785 * Move first hop before start of options.
2787 bcopy((caddr_t
)&cp
[IPOPT_OFFSET
+1], mtod(m
, caddr_t
),
2788 sizeof(struct in_addr
));
2790 * Then copy rest of options back
2791 * to close up the deleted entry.
2793 ovbcopy((caddr_t
)(&cp
[IPOPT_OFFSET
+1] +
2794 sizeof(struct in_addr
)),
2795 (caddr_t
)&cp
[IPOPT_OFFSET
+1],
2796 (unsigned)cnt
+ sizeof(struct in_addr
));
2800 if (m
->m_len
> MAX_IPOPTLEN
+ sizeof(struct in_addr
))
/*
 * Set up the allocation zone used for struct ip_moptions.
 *
 * Reads the "ifa_debug" boot argument into imo_debug.  When imo_debug
 * is zero each zone element is sized as struct ip_moptions; otherwise
 * it is sized as struct ip_moptions_dbg.  The zone is created with
 * zinit() using that element size and a maximum of
 * IMO_ZONE_MAX * imo_size, the routine panics if zinit() returns NULL,
 * and Z_EXPAND is then set to TRUE on the zone.
 *
 * NOTE(review): the trailing arguments of the zinit() call are on a
 * line that is not visible in this excerpt.
 */
2811 ip_moptions_init(void)
2813 PE_parse_boot_argn("ifa_debug", &imo_debug
, sizeof (imo_debug
));
2815 imo_size
= (imo_debug
== 0) ? sizeof (struct ip_moptions
) :
2816 sizeof (struct ip_moptions_dbg
);
2818 imo_zone
= zinit(imo_size
, IMO_ZONE_MAX
* imo_size
, 0,
2820 if (imo_zone
== NULL
) {
2821 panic("%s: failed allocating %s", __func__
, IMO_ZONE_NAME
);
2824 zone_change(imo_zone
, Z_EXPAND
, TRUE
);
/*
 * Take one additional reference on imo.
 *
 * imo_refcnt is pre-incremented; a result of zero is treated as a
 * counter wraparound and panics.  Otherwise, when imo->imo_trace is
 * set, it is called with TRUE to record the hold.
 *
 * NOTE(review): how the locked argument is used is not visible in this
 * excerpt; only the IMO_LOCK_ASSERT_HELD() call is shown.  Presumably
 * the lock is taken here when the caller does not already hold it;
 * confirm against the full source.
 */
2828 imo_addref(struct ip_moptions
*imo
, int locked
)
2833 IMO_LOCK_ASSERT_HELD(imo
);
2835 if (++imo
->imo_refcnt
== 0) {
2836 panic("%s: imo %p wraparound refcnt\n", __func__
, imo
);
2838 } else if (imo
->imo_trace
!= NULL
) {
2839 (*imo
->imo_trace
)(imo
, TRUE
);
/*
 * Release a reference on imo and tear the structure down when it is no
 * longer referenced.
 *
 * Panics if imo_refcnt is already zero on entry.  Otherwise, when
 * imo->imo_trace is set, it is called with FALSE to record the
 * release.
 *
 * NOTE(review): the decrement of imo_refcnt and the body of the
 * "imo_refcnt > 0" test are not visible in this excerpt.  Presumably
 * the count is decremented before that test and the routine returns
 * early while references remain; confirm against the full source.
 *
 * The visible tear-down sequence is:
 *  - for each of the imo_num_memberships entries, call
 *    in_leavegroup() with the matching source filter (or NULL when
 *    imo_mfilters is not allocated), drop the membership reference
 *    with INM_REMREF() and clear the slot;
 *  - reset imo_num_memberships to 0;
 *  - FREE() the imo_mfilters and imo_membership arrays when present;
 *  - destroy imo_lock;
 *  - panic unless IFD_ALLOC is set in imo_debug, then zfree() the
 *    structure back to imo_zone.
 */
2847 imo_remref(struct ip_moptions
*imo
)
2852 if (imo
->imo_refcnt
== 0) {
2853 panic("%s: imo %p negative refcnt", __func__
, imo
);
2855 } else if (imo
->imo_trace
!= NULL
) {
2856 (*imo
->imo_trace
)(imo
, FALSE
);
2860 if (imo
->imo_refcnt
> 0) {
/* Leave every joined group and drop the reference held on it. */
2865 for (i
= 0; i
< imo
->imo_num_memberships
; ++i
) {
2866 struct in_mfilter
*imf
;
2868 imf
= imo
->imo_mfilters
? &imo
->imo_mfilters
[i
] : NULL
;
2872 (void) in_leavegroup(imo
->imo_membership
[i
], imf
);
2877 INM_REMREF(imo
->imo_membership
[i
]);
2878 imo
->imo_membership
[i
] = NULL
;
2880 imo
->imo_num_memberships
= 0;
2881 if (imo
->imo_mfilters
!= NULL
) {
2882 FREE(imo
->imo_mfilters
, M_INMFILTER
);
2883 imo
->imo_mfilters
= NULL
;
2885 if (imo
->imo_membership
!= NULL
) {
2886 FREE(imo
->imo_membership
, M_IPMOPTS
);
2887 imo
->imo_membership
= NULL
;
/* The lock is destroyed before the memory goes back to the zone. */
2891 lck_mtx_destroy(&imo
->imo_lock
, ifa_mtx_grp
);
2893 if (!(imo
->imo_debug
& IFD_ALLOC
)) {
2894 panic("%s: imo %p cannot be freed", __func__
, imo
);
2897 zfree(imo_zone
, imo
);
/*
 * Record a reference hold or release event for imo in its debug
 * history.
 *
 * imo is reinterpreted as a struct ip_moptions_dbg; the routine panics
 * if IFD_DEBUG is not set in imo_debug, since such an object has no
 * debug area.  A counter and history array pair is chosen, either
 * imo_refhold_cnt with imo_refhold or imo_refrele_cnt with
 * imo_refrele.
 *
 * NOTE(review): the condition that picks between the two pairs is not
 * visible in this excerpt; presumably it is the refhold argument.
 */
2901 imo_trace(struct ip_moptions
*imo
, int refhold
)
2903 struct ip_moptions_dbg
*imo_dbg
= (struct ip_moptions_dbg
*)imo
;
2908 if (!(imo
->imo_debug
& IFD_DEBUG
)) {
2909 panic("%s: imo %p has no debug structure", __func__
, imo
);
2913 cnt
= &imo_dbg
->imo_refhold_cnt
;
2914 tr
= imo_dbg
->imo_refhold
;
2916 cnt
= &imo_dbg
->imo_refrele_cnt
;
2917 tr
= imo_dbg
->imo_refrele
;
/*
 * The result of the atomic add, reduced modulo IMO_TRACE_HIST_SIZE,
 * is the history slot that ctrace_record() fills in.
 */
2920 idx
= atomic_add_16_ov(cnt
, 1) % IMO_TRACE_HIST_SIZE
;
2921 ctrace_record(&tr
[idx
]);
/*
 * Allocate and initialize a struct ip_moptions from imo_zone.
 *
 * how == M_WAITOK selects zalloc(); any other value selects
 * zalloc_noblock().  The new element is zeroed over imo_size bytes,
 * its imo_lock is initialized, and IFD_ALLOC is set in imo_debug.
 * When the global imo_debug is non-zero, IFD_DEBUG is set as well and
 * the imo_trace hook is pointed at imo_trace().
 *
 * NOTE(review): neither a check of the allocation result nor the
 * return statement is visible in this excerpt; confirm both against
 * the full source.
 */
2924 struct ip_moptions
*
2925 ip_allocmoptions(int how
)
2927 struct ip_moptions
*imo
;
2929 imo
= (how
== M_WAITOK
) ? zalloc(imo_zone
) : zalloc_noblock(imo_zone
);
2931 bzero(imo
, imo_size
);
2932 lck_mtx_init(&imo
->imo_lock
, ifa_mtx_grp
, ifa_mtx_attr
);
2933 imo
->imo_debug
|= IFD_ALLOC
;
2934 if (imo_debug
!= 0) {
2935 imo
->imo_debug
|= IFD_DEBUG
;
2936 imo
->imo_trace
= imo_trace
;
2945 * Routine called from ip_output() to loop back a copy of an IP multicast
2946 * packet to the input queue of a specified interface. Note that this
2947 * calls the output routine of the loopback "driver", but with an interface
2948 * pointer that might NOT be a loopback interface -- evil, but easier than
2949 * replicating that code here.
/*
 * Parameters (K&R style definition):
 *   ifp  - interface stored in the copy as m_pkthdr.rcvif; its
 *          if_hwassist is consulted for the checksum decision.
 *   m    - packet to copy; only the copy is modified by the visible
 *          lines, except that dst->sin_family may be rewritten.
 *   dst  - destination handed to dlil_output().
 *   hlen - length passed to m_pullup() and in_cksum().
 *
 * Visible steps:
 *  - sw_csum is true when apple_hwcksum_tx is 0;
 *  - the whole packet is copied with m_copy(m, 0, M_COPYALL), and the
 *    copy is run through m_pullup(hlen) when its first mbuf has M_EXT
 *    set or holds fewer than hlen bytes;
 *  - ip_sum in the copy is recomputed with in_cksum(copym, hlen);
 *  - a dst->sin_family other than AF_INET is logged and forced to
 *    AF_INET;
 *  - when sw_csum is set or CSUM_DELAY_DATA is pending on the copy:
 *    if sw_csum is clear and IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)
 *    is non-zero the copy is marked as already verified (csum_data
 *    0xffff), otherwise (presumably the else branch, not visible
 *    here) in_delayed_cksum() is called;
 *  - rcvif is set to ifp and the copy is sent through
 *    dlil_output(lo_ifp, PF_INET, ...).
 *
 * NOTE(review): this excerpt omits the declarations of ifp, hlen and
 * copym, the bodies of the BYTE_ORDER conditionals, the check that
 * follows m_pullup(), and the conditional around the final printf.
 */
2952 ip_mloopback(ifp
, m
, dst
, hlen
)
2954 register struct mbuf
*m
;
2955 register struct sockaddr_in
*dst
;
2958 register struct ip
*ip
;
2960 int sw_csum
= (apple_hwcksum_tx
== 0);
/* Work on a private copy so the original packet is left untouched. */
2962 copym
= m_copy(m
, 0, M_COPYALL
);
2963 if (copym
!= NULL
&& (copym
->m_flags
& M_EXT
|| copym
->m_len
< hlen
))
2964 copym
= m_pullup(copym
, hlen
);
2970 * We don't bother to fragment if the IP length is greater
2971 * than the interface's MTU. Can this possibly matter?
2973 ip
= mtod(copym
, struct ip
*);
2975 #if BYTE_ORDER != BIG_ENDIAN
2981 ip
->ip_sum
= in_cksum(copym
, hlen
);
2984 * It's not clear whether there are any lingering
2985 * reentrancy problems in other areas which might
2986 * be exposed by using ip_input directly (in
2987 * particular, everything which modifies the packet
2988 * in-place). Yet another option is using the
2989 * protosw directly to deliver the looped back
2990 * packet. For the moment, we'll err on the side
2991 * of safety by using if_simloop().
2994 if (dst
->sin_family
!= AF_INET
) {
2995 printf("ip_mloopback: bad address family %d\n",
2997 dst
->sin_family
= AF_INET
;
3002 * Mark checksum as valid or calculate checksum for loopback.
3004 * This is done this way because we have to embed the ifp of
3005 * the interface we will send the original copy of the packet
3006 * out on in the mbuf. ip_input will check if_hwassist of the
3007 * embedded ifp and ignore all csum_flags if if_hwassist is 0.
3008 * The UDP checksum has not been calculated yet.
3010 if (sw_csum
|| (copym
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
)) {
3011 if (!sw_csum
&& IF_HWASSIST_CSUM_FLAGS(ifp
->if_hwassist
)) {
3012 copym
->m_pkthdr
.csum_flags
|=
3013 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
|
3014 CSUM_IP_CHECKED
| CSUM_IP_VALID
;
3015 copym
->m_pkthdr
.csum_data
= 0xffff;
3018 #if BYTE_ORDER != BIG_ENDIAN
3022 in_delayed_cksum(copym
);
3024 #if BYTE_ORDER != BIG_ENDIAN
3033 * We need to send all loopback traffic down to dlil in case
3034 * a filter has tapped-in.
3038 * Stuff the 'real' ifp into the pkthdr, to be used in matching
3039 * in ip_input(); we need the loopback ifp/dl_tag passed as args
3040 * to make the loopback driver compliant with the data link
3044 copym
->m_pkthdr
.rcvif
= ifp
;
3045 dlil_output(lo_ifp
, PF_INET
, copym
, 0,
3046 (struct sockaddr
*) dst
, 0);
3048 printf("Warning: ip_output call to dlil_find_dltag failed!\n");
3054 * Given a source IP address (and route, if available), determine the best
3055 * interface to send the packet from. Checking for (and updating) the
3056 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3057 * without any locks based on the assumption that ip_output() is single-
3058 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3059 * performing output at the IP layer.
3061 * This routine is analogous to in6_selectroute() for IPv6.
3063 static struct ifaddr
*
3064 in_selectsrcif(struct ip
*ip
, struct route
*ro
, unsigned int ifscope
)
3066 struct ifaddr
*ifa
= NULL
;
3067 struct in_addr src
= ip
->ip_src
;
3068 struct in_addr dst
= ip
->ip_dst
;
3069 struct ifnet
*rt_ifp
;
3070 char s_src
[MAX_IPv4_STR_LEN
], s_dst
[MAX_IPv4_STR_LEN
];
3072 if (ip_select_srcif_debug
) {
3073 (void) inet_ntop(AF_INET
, &src
.s_addr
, s_src
, sizeof (s_src
));
3074 (void) inet_ntop(AF_INET
, &dst
.s_addr
, s_dst
, sizeof (s_dst
));
3077 if (ro
->ro_rt
!= NULL
)
3080 rt_ifp
= (ro
->ro_rt
!= NULL
) ? ro
->ro_rt
->rt_ifp
: NULL
;
3083 * Given the source IP address, find a suitable source interface
3084 * to use for transmission; if the caller has specified a scope,
3085 * optimize the search by looking at the addresses only for that
3086 * interface. This is still suboptimal, however, as we need to
3087 * traverse the per-interface list.
3089 if (ifscope
!= IFSCOPE_NONE
|| ro
->ro_rt
!= NULL
) {
3090 unsigned int scope
= ifscope
;
3093 * If no scope is specified and the route is stale (pointing
3094 * to a defunct interface) use the current primary interface;
3095 * this happens when switching between interfaces configured
3096 * with the same IP address. Otherwise pick up the scope
3097 * information from the route; the ULP may have looked up a
3098 * correct route and we just need to verify it here and mark
3099 * it with the ROF_SRCIF_SELECTED flag below.
3101 if (scope
== IFSCOPE_NONE
) {
3102 scope
= rt_ifp
->if_index
;
3103 if (scope
!= get_primary_ifscope(AF_INET
) &&
3104 ro
->ro_rt
->generation_id
!= route_generation
)
3105 scope
= get_primary_ifscope(AF_INET
);
3108 ifa
= (struct ifaddr
*)ifa_foraddr_scoped(src
.s_addr
, scope
);
3110 if (ifa
== NULL
&& ip
->ip_p
!= IPPROTO_UDP
&&
3111 ip
->ip_p
!= IPPROTO_TCP
&& ipforwarding
) {
3113 * If forwarding is enabled, and if the packet isn't
3114 * TCP or UDP, check if the source address belongs
3115 * to one of our own interfaces; if so, demote the
3116 * interface scope and do a route lookup right below.
3118 ifa
= (struct ifaddr
*)ifa_foraddr(src
.s_addr
);
3122 ifscope
= IFSCOPE_NONE
;
3126 if (ip_select_srcif_debug
&& ifa
!= NULL
) {
3127 if (ro
->ro_rt
!= NULL
) {
3128 printf("%s->%s ifscope %d->%d ifa_if %s "
3129 "ro_if %s\n", s_src
, s_dst
, ifscope
,
3130 scope
, if_name(ifa
->ifa_ifp
),
3133 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3134 s_src
, s_dst
, ifscope
, scope
,
3135 if_name(ifa
->ifa_ifp
));
3141 * Slow path; search for an interface having the corresponding source
3142 * IP address if the scope was not specified by the caller, and:
3144 * 1) There currently isn't any route, or,
3145 * 2) The interface used by the route does not own that source
3146 * IP address; in this case, the route will get blown away
3147 * and we'll do a more specific scoped search using the newly
3150 if (ifa
== NULL
&& ifscope
== IFSCOPE_NONE
) {
3151 ifa
= (struct ifaddr
*)ifa_foraddr(src
.s_addr
);
3154 * If we have the IP address, but not the route, we don't
3155 * really know whether or not it belongs to the correct
3156 * interface (it could be shared across multiple interfaces.)
3157 * The only way to find out is to do a route lookup.
3159 if (ifa
!= NULL
&& ro
->ro_rt
== NULL
) {
3161 struct sockaddr_in sin
;
3162 struct ifaddr
*oifa
= NULL
;
3164 bzero(&sin
, sizeof (sin
));
3165 sin
.sin_family
= AF_INET
;
3166 sin
.sin_len
= sizeof (sin
);
3169 lck_mtx_lock(rnh_lock
);
3170 if ((rt
= rt_lookup(TRUE
, (struct sockaddr
*)&sin
, NULL
,
3171 rt_tables
[AF_INET
], IFSCOPE_NONE
)) != NULL
) {
3174 * If the route uses a different interface,
3175 * use that one instead. The IP address of
3176 * the ifaddr that we pick up here is not
3179 if (ifa
->ifa_ifp
!= rt
->rt_ifp
) {
3189 lck_mtx_unlock(rnh_lock
);
3192 struct ifaddr
*iifa
;
3195 * See if the interface pointed to by the
3196 * route is configured with the source IP
3197 * address of the packet.
3199 iifa
= (struct ifaddr
*)ifa_foraddr_scoped(
3200 src
.s_addr
, ifa
->ifa_ifp
->if_index
);
3204 * Found it; drop the original one
3205 * as well as the route interface
3206 * address, and use this instead.
3211 } else if (!ipforwarding
||
3212 (rt
->rt_flags
& RTF_GATEWAY
)) {
3214 * This interface doesn't have that
3215 * source IP address; drop the route
3216 * interface address and just use the
3217 * original one, and let the caller
3218 * do a scoped route lookup.
3224 * Forwarding is enabled and the source
3225 * address belongs to one of our own
3226 * interfaces which isn't the outgoing
3227 * interface, and we have a route, and
3228 * the destination is on a network that
3229 * is directly attached (onlink); drop
3230 * the original one and use the route
3231 * interface address instead.
3236 } else if (ifa
!= NULL
&& ro
->ro_rt
!= NULL
&&
3237 !(ro
->ro_rt
->rt_flags
& RTF_GATEWAY
) &&
3238 ifa
->ifa_ifp
!= ro
->ro_rt
->rt_ifp
&& ipforwarding
) {
3240 * Forwarding is enabled and the source address belongs
3241 * to one of our own interfaces which isn't the same
3242 * as the interface used by the known route; drop the
3243 * original one and use the route interface address.
3246 ifa
= ro
->ro_rt
->rt_ifa
;
3250 if (ip_select_srcif_debug
&& ifa
!= NULL
) {
3251 printf("%s->%s ifscope %d ifa_if %s\n",
3252 s_src
, s_dst
, ifscope
, if_name(ifa
->ifa_ifp
));
3256 if (ro
->ro_rt
!= NULL
)
3257 RT_LOCK_ASSERT_HELD(ro
->ro_rt
);
3259 * If there is a non-loopback route with the wrong interface, or if
3260 * there is no interface configured with such an address, blow it
3261 * away. Except for local/loopback, we look for one with a matching
3262 * interface scope/index.
3264 if (ro
->ro_rt
!= NULL
&&
3265 (ifa
== NULL
|| (ifa
->ifa_ifp
!= rt_ifp
&& rt_ifp
!= lo_ifp
) ||
3266 !(ro
->ro_rt
->rt_flags
& RTF_UP
))) {
3267 if (ip_select_srcif_debug
) {
3269 printf("%s->%s ifscope %d ro_if %s != "
3270 "ifa_if %s (cached route cleared)\n",
3271 s_src
, s_dst
, ifscope
, if_name(rt_ifp
),
3272 if_name(ifa
->ifa_ifp
));
3274 printf("%s->%s ifscope %d ro_if %s "
3275 "(no ifa_if found)\n",
3276 s_src
, s_dst
, ifscope
, if_name(rt_ifp
));
3280 RT_UNLOCK(ro
->ro_rt
);
3283 ro
->ro_flags
&= ~ROF_SRCIF_SELECTED
;
3286 * If the destination is IPv4 LLA and the route's interface
3287 * doesn't match the source interface, then the source IP
3288 * address is wrong; it most likely belongs to the primary
3289 * interface associated with the IPv4 LL subnet. Drop the
3290 * packet rather than letting it go out and return an error
3291 * to the ULP. This actually applies not only to IPv4 LL
3292 * but other shared subnets; for now we explicitly test only
3293 * for the former case and save the latter for future.
3295 if (IN_LINKLOCAL(ntohl(dst
.s_addr
)) &&
3296 !IN_LINKLOCAL(ntohl(src
.s_addr
)) && ifa
!= NULL
) {
3302 if (ip_select_srcif_debug
&& ifa
== NULL
) {
3303 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3304 s_src
, s_dst
, ifscope
);
3308 * If there is a route, mark it accordingly. If there isn't one,
3309 * we'll get here again during the next transmit (possibly with a
3310 * route) and the flag will get set at that point. For IPv4 LLA
3311 * destination, mark it only if the route has been fully resolved;
3312 * otherwise we want to come back here again when the route points
3313 * to the interface over which the ARP reply arrives.
3315 if (ro
->ro_rt
!= NULL
&& (!IN_LINKLOCAL(ntohl(dst
.s_addr
)) ||
3316 (ro
->ro_rt
->rt_gateway
->sa_family
== AF_LINK
&&
3317 SDL(ro
->ro_rt
->rt_gateway
)->sdl_alen
!= 0))) {
3318 ro
->ro_flags
|= ROF_SRCIF_SELECTED
;
3319 ro
->ro_rt
->generation_id
= route_generation
;
3322 if (ro
->ro_rt
!= NULL
)
3323 RT_UNLOCK(ro
->ro_rt
);