2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
71 #include <sys/param.h>
72 #include <sys/systm.h>
73 #include <sys/kernel.h>
74 #include <sys/malloc.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/socketvar.h>
79 #include <kern/locks.h>
80 #include <sys/sysctl.h>
81 #include <sys/mcache.h>
82 #include <sys/kdebug.h>
84 #include <machine/endian.h>
85 #include <pexpert/pexpert.h>
88 #include <libkern/OSAtomic.h>
89 #include <libkern/OSByteOrder.h>
92 #include <net/if_dl.h>
93 #include <net/if_types.h>
94 #include <net/route.h>
95 #include <net/ntstat.h>
96 #include <net/net_osdep.h>
98 #include <net/net_perf.h>
100 #include <netinet/in.h>
101 #include <netinet/in_systm.h>
102 #include <netinet/ip.h>
103 #include <netinet/in_pcb.h>
104 #include <netinet/in_var.h>
105 #include <netinet/ip_var.h>
106 #include <netinet/kpi_ipfilter_var.h>
107 #include <netinet/in_tclass.h>
108 #include <netinet/udp.h>
110 #include <netinet6/nd6.h>
112 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
113 #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
114 #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
115 #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
118 #include <netinet6/ipsec.h>
119 #include <netkey/key.h>
121 #include <netkey/key_debug.h>
123 #define KEYDEBUG(lev, arg)
128 #include <net/necp.h>
133 #include <netinet/ip_dummynet.h>
137 #include <net/pfvar.h>
/*
 * Prototypes for the sysctl handlers referenced by the SYSCTL_PROC
 * entries (output_perf, output_perf_bins, output_perf_data) in this file.
 */
143 static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
;
144 static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
;
145 static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
;
/* Prototypes for file-local (static) helper routines. */
146 static void ip_out_cksum_stats(int, u_int32_t
);
147 static struct mbuf
*ip_insertoptions(struct mbuf
*, struct mbuf
*, int *);
148 static int ip_optcopy(struct ip
*, struct ip
*);
149 static int ip_pcbopts(int, struct mbuf
**, struct mbuf
*);
150 static void imo_trace(struct ip_moptions
*, int);
151 static void ip_mloopback(struct ifnet
*, struct ifnet
*, struct mbuf
*,
152 struct sockaddr_in
*, int);
153 static struct ifaddr
*in_selectsrcif(struct ip
*, struct route
*, unsigned int);
/*
 * Link-local statistics, declared extern here; ip_output_list() bumps
 * iplls_out_total and iplls_out_badttl for link-local src/dst packets.
 */
155 extern struct ip_linklocal_stat ip_linklocal_stat
;
157 /* temporary: for testing */
/* ip_output_list() skips its IPsec processing when this is non-zero. */
159 extern int ipsec_bypass
;
/*
 * Tunables and statistics registered under the _net_inet_ip sysctl node.
 * Every entry below carries CTLFLAG_LOCKED.
 */

/* Read-write int knob "maxchainsent"; described as "use dlil_output_list". */
162 static int ip_maxchainsent
= 0;
163 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, maxchainsent
,
164 CTLFLAG_RW
| CTLFLAG_LOCKED
, &ip_maxchainsent
, 0,
165 "use dlil_output_list");
/*
 * Read-write int knob "forge_ce".
 * NOTE(review): presumably gates the debug-only forging of ECN CE marks
 * in ip_output_list() -- confirm against the guarding condition there.
 */
167 static int forge_ce
= 0;
168 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, forge_ce
,
169 CTLFLAG_RW
| CTLFLAG_LOCKED
, &forge_ce
, 0,
/* Read-write int knob "select_srcif_debug" for source-interface selection logging. */
173 static int ip_select_srcif_debug
= 0;
174 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, select_srcif_debug
,
175 CTLFLAG_RW
| CTLFLAG_LOCKED
, &ip_select_srcif_debug
, 0,
176 "log source interface selection debug info");
/*
 * Read-write knob "output_perf", served by sysctl_reset_ip_output_stats.
 * When non-zero, ip_output_list() records a start time with
 * net_perf_start_time() on entry.
 */
178 static int ip_output_measure
= 0;
179 SYSCTL_PROC(_net_inet_ip
, OID_AUTO
, output_perf
,
180 CTLTYPE_INT
| CTLFLAG_RW
| CTLFLAG_LOCKED
,
181 &ip_output_measure
, 0, sysctl_reset_ip_output_stats
, "I",
182 "Do time measurement");
/*
 * Read-write 64-bit knob "output_perf_bins", served by
 * sysctl_ip_output_measure_bins.
 * NOTE(review): the type is CTLTYPE_QUAD but the format string is "I";
 * confirm whether "Q" was intended.
 */
184 static uint64_t ip_output_measure_bins
= 0;
185 SYSCTL_PROC(_net_inet_ip
, OID_AUTO
, output_perf_bins
,
186 CTLTYPE_QUAD
| CTLFLAG_RW
| CTLFLAG_LOCKED
, &ip_output_measure_bins
, 0,
187 sysctl_ip_output_measure_bins
, "I",
188 "bins for chaining performance data histogram");
/*
 * Performance data exported read-only as a struct through
 * sysctl_ip_output_getperf ("output_perf_data").
 */
190 static net_perf_t net_perf
;
191 SYSCTL_PROC(_net_inet_ip
, OID_AUTO
, output_perf_data
,
192 CTLTYPE_STRUCT
| CTLFLAG_RD
| CTLFLAG_LOCKED
,
193 0, 0, sysctl_ip_output_getperf
, "S,net_perf",
194 "IP output performance data (struct net_perf, net/net_perf.h)");
/*
 * Read-write int knob "rfc6864", enabled (1) by default.  ip_output_list()
 * tests it together with IP_OFF_IS_ATOMIC() when filling in ip_id.
 */
196 __private_extern__
int rfc6864
= 1;
197 SYSCTL_INT(_net_inet_ip
, OID_AUTO
, rfc6864
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
198 &rfc6864
, 0, "updated ip id field behavior");
/* Number of entries kept in each of the imo_refhold / imo_refrele arrays. */
200 #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
/* Non-static copy of the history size, initialised from the macro above. */
203 __private_extern__
unsigned int imo_trace_hist_size
= IMO_TRACE_HIST_SIZE
;
/*
 * Debug wrapper around struct ip_moptions: embeds the options structure
 * as its first member and adds hold/release counters plus ctrace_t
 * records of the callers.
 */
205 struct ip_moptions_dbg
{
206 struct ip_moptions imo
; /* ip_moptions */
207 u_int16_t imo_refhold_cnt
; /* # of IMO_ADDREF */
208 u_int16_t imo_refrele_cnt
; /* # of IMO_REMREF */
210 * Alloc and free callers.
215 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
217 ctrace_t imo_refhold
[IMO_TRACE_HIST_SIZE
];
218 ctrace_t imo_refrele
[IMO_TRACE_HIST_SIZE
];
/*
 * imo_debug is defined twice, once initialised to 1 and once left
 * zero-initialised.
 * NOTE(review): presumably the two definitions sit on opposite arms of a
 * conditional-compilation block -- confirm.
 */
222 static unsigned int imo_debug
= 1; /* debugging (enabled) */
224 static unsigned int imo_debug
; /* debugging (disabled) */
226 static struct zone
*imo_zone
; /* zone for ip_moptions */
227 #define IMO_ZONE_NAME "ip_moptions" /* zone name */
230 * IP output. The packet in mbuf chain m contains a skeletal IP
231 * header (with len, off, ttl, proto, tos, src, dst).
232 * The mbuf chain containing the packet will be freed.
233 * The mbuf opt, if present, will not be freed.
/*
 * Single-packet entry point: passes m0, opt, ro, flags, imo and ipoa
 * through unchanged to ip_output_list() with its packetchain argument
 * set to 0, and returns whatever ip_output_list() returns.
 */
236 ip_output(struct mbuf
*m0
, struct mbuf
*opt
, struct route
*ro
, int flags
,
237 struct ip_moptions
*imo
, struct ip_out_args
*ipoa
)
239 return ip_output_list(m0
, 0, opt
, ro
, flags
, imo
, ipoa
);
243 * IP output. The packet in mbuf chain m contains a skeletal IP
244 * header (with len, off, ttl, proto, tos, src, dst).
245 * The mbuf chain containing the packet will be freed.
246 * The mbuf opt, if present, will not be freed.
248 * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be
249 * skipped and ro->ro_rt would be used. Otherwise the result of route
250 * lookup is stored in ro->ro_rt.
252 * In the IP forwarding case, the packet will arrive with options already
253 * inserted, so must have a NULL opt pointer.
256 ip_output_list(struct mbuf
*m0
, int packetchain
, struct mbuf
*opt
,
257 struct route
*ro
, int flags
, struct ip_moptions
*imo
,
258 struct ip_out_args
*ipoa
)
261 struct ifnet
*ifp
= NULL
; /* not refcnt'd */
262 struct mbuf
*m
= m0
, *prevnxt
= NULL
, **mppn
= &prevnxt
;
263 int hlen
= sizeof(struct ip
);
264 int len
= 0, error
= 0;
265 struct sockaddr_in
*dst
= NULL
;
266 struct in_ifaddr
*ia
= NULL
, *src_ia
= NULL
;
267 struct in_addr pkt_dst
;
268 struct ipf_pktopts
*ippo
= NULL
;
269 ipfilter_t inject_filter_ref
= NULL
;
270 struct mbuf
*packetlist
;
271 uint32_t sw_csum
, pktcnt
= 0, scnt
= 0, bytecnt
= 0;
272 uint32_t packets_processed
= 0;
273 unsigned int ifscope
= IFSCOPE_NONE
;
274 struct flowadv
*adv
= NULL
;
275 struct timeval start_tv
;
277 struct socket
*so
= NULL
;
278 struct secpolicy
*sp
= NULL
;
281 necp_kernel_policy_result necp_result
= 0;
282 necp_kernel_policy_result_parameter necp_result_parameter
;
283 necp_kernel_policy_id necp_matched_policy_id
= 0;
287 struct ip_out_args saved_ipoa
;
288 struct sockaddr_in dst_buf
;
289 #endif /* DUMMYNET */
292 struct ipsec_output_state ipsec_state
;
295 struct route necp_route
;
298 struct ip_fw_args args
;
299 struct route saved_route
;
300 #endif /* DUMMYNET */
301 struct ipf_pktopts ipf_pktopts
;
303 #define ipsec_state ipobz.ipsec_state
304 #define necp_route ipobz.necp_route
305 #define args ipobz.args
306 #define sro_fwd ipobz.sro_fwd
307 #define saved_route ipobz.saved_route
308 #define ipf_pktopts ipobz.ipf_pktopts
311 boolean_t select_srcif
: 1; /* set once */
312 boolean_t srcbound
: 1; /* set once */
313 boolean_t nocell
: 1; /* set once */
314 boolean_t isbroadcast
: 1;
315 boolean_t didfilter
: 1;
316 boolean_t noexpensive
: 1; /* set once */
317 boolean_t noconstrained
: 1; /* set once */
318 boolean_t awdl_unrestricted
: 1; /* set once */
321 } ipobf
= { .raw
= 0 };
323 int interface_mtu
= 0;
326 * Here we check for restrictions when sending frames.
327 * N.B.: IPv4 over internal co-processor interfaces is not allowed.
329 #define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \
330 (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \
331 ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \
332 ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) || \
333 (IFNET_IS_INTCOPROC(_ifp)) || \
334 (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
336 if (ip_output_measure
) {
337 net_perf_start_time(&net_perf
, &start_tv
);
339 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT
| DBG_FUNC_START
, 0, 0, 0, 0, 0);
341 VERIFY(m0
->m_flags
& M_PKTHDR
);
344 /* zero out {ipsec_state, args, sro_fwd, saved_route, ipf_pktopts} */
345 bzero(&ipobz
, sizeof(ipobz
));
349 if (SLIST_EMPTY(&m0
->m_pkthdr
.tags
)) {
353 /* Grab info from mtags prepended to the chain */
354 if ((tag
= m_tag_locate(m0
, KERNEL_MODULE_TAG_ID
,
355 KERNEL_TAG_TYPE_DUMMYNET
, NULL
)) != NULL
) {
356 struct dn_pkt_tag
*dn_tag
;
358 dn_tag
= (struct dn_pkt_tag
*)(tag
+ 1);
359 args
.fwa_pf_rule
= dn_tag
->dn_pf_rule
;
361 saved_route
= dn_tag
->dn_ro
;
365 bcopy(&dn_tag
->dn_dst
, &dst_buf
, sizeof(dst_buf
));
367 ifp
= dn_tag
->dn_ifp
;
368 flags
= dn_tag
->dn_flags
;
369 if ((dn_tag
->dn_flags
& IP_OUTARGS
)) {
370 saved_ipoa
= dn_tag
->dn_ipoa
;
374 m_tag_delete(m0
, tag
);
377 #endif /* DUMMYNET */
380 m
->m_pkthdr
.pkt_flags
&= ~(PKTF_LOOP
| PKTF_IFAINFO
);
383 if (ipsec_bypass
== 0 && !(flags
& IP_NOIPSEC
)) {
384 /* If packet is bound to an interface, check bound policies */
385 if ((flags
& IP_OUTARGS
) && (ipoa
!= NULL
) &&
386 (ipoa
->ipoa_flags
& IPOAF_BOUND_IF
) &&
387 ipoa
->ipoa_boundif
!= IFSCOPE_NONE
) {
388 if (ipsec4_getpolicybyinterface(m
, IPSEC_DIR_OUTBOUND
,
389 &flags
, ipoa
, &sp
) != 0) {
398 if (flags
& IP_OUTARGS
) {
400 * In the forwarding case, only the ifscope value is used,
401 * as source interface selection doesn't take place.
403 if ((ipobf
.select_srcif
= (!(flags
& IP_FORWARDING
) &&
404 (ipoa
->ipoa_flags
& IPOAF_SELECT_SRCIF
)))) {
405 ipf_pktopts
.ippo_flags
|= IPPOF_SELECT_SRCIF
;
408 if ((ipoa
->ipoa_flags
& IPOAF_BOUND_IF
) &&
409 ipoa
->ipoa_boundif
!= IFSCOPE_NONE
) {
410 ifscope
= ipoa
->ipoa_boundif
;
411 ipf_pktopts
.ippo_flags
|=
412 (IPPOF_BOUND_IF
| (ifscope
<< IPPOF_SHIFT_IFSCOPE
));
415 /* double negation needed for bool bit field */
416 ipobf
.srcbound
= !!(ipoa
->ipoa_flags
& IPOAF_BOUND_SRCADDR
);
417 if (ipobf
.srcbound
) {
418 ipf_pktopts
.ippo_flags
|= IPPOF_BOUND_SRCADDR
;
421 ipobf
.select_srcif
= FALSE
;
422 ipobf
.srcbound
= FALSE
;
423 ifscope
= IFSCOPE_NONE
;
424 if (flags
& IP_OUTARGS
) {
425 ipoa
->ipoa_boundif
= IFSCOPE_NONE
;
426 ipoa
->ipoa_flags
&= ~(IPOAF_SELECT_SRCIF
|
427 IPOAF_BOUND_IF
| IPOAF_BOUND_SRCADDR
);
431 if (flags
& IP_OUTARGS
) {
432 if (ipoa
->ipoa_flags
& IPOAF_NO_CELLULAR
) {
434 ipf_pktopts
.ippo_flags
|= IPPOF_NO_IFT_CELLULAR
;
436 if (ipoa
->ipoa_flags
& IPOAF_NO_EXPENSIVE
) {
437 ipobf
.noexpensive
= TRUE
;
438 ipf_pktopts
.ippo_flags
|= IPPOF_NO_IFF_EXPENSIVE
;
440 if (ipoa
->ipoa_flags
& IPOAF_NO_CONSTRAINED
) {
441 ipobf
.noconstrained
= TRUE
;
442 ipf_pktopts
.ippo_flags
|= IPPOF_NO_IFF_CONSTRAINED
;
444 if (ipoa
->ipoa_flags
& IPOAF_AWDL_UNRESTRICTED
) {
445 ipobf
.awdl_unrestricted
= TRUE
;
447 adv
= &ipoa
->ipoa_flowadv
;
448 adv
->code
= FADV_SUCCESS
;
449 ipoa
->ipoa_retflags
= 0;
453 if (ipsec_bypass
== 0 && !(flags
& IP_NOIPSEC
)) {
454 so
= ipsec_getsocket(m
);
456 (void) ipsec_setsocket(m
, NULL
);
462 if (args
.fwa_pf_rule
!= NULL
) {
463 /* dummynet already saw us */
464 ip
= mtod(m
, struct ip
*);
465 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
466 pkt_dst
= ip
->ip_dst
;
467 if (ro
->ro_rt
!= NULL
) {
468 RT_LOCK_SPIN(ro
->ro_rt
);
469 ia
= (struct in_ifaddr
*)ro
->ro_rt
->rt_ifa
;
471 /* Become a regular mutex */
472 RT_CONVERT_LOCK(ro
->ro_rt
);
473 IFA_ADDREF(&ia
->ia_ifa
);
475 RT_UNLOCK(ro
->ro_rt
);
478 if (args
.fwa_pf_rule
!= NULL
) {
482 #endif /* DUMMYNET */
486 ipobf
.isbroadcast
= FALSE
;
487 ipobf
.didfilter
= FALSE
;
489 VERIFY(m
->m_flags
& M_PKTHDR
);
491 * No need to process packet twice if we've already seen it.
493 if (!SLIST_EMPTY(&m
->m_pkthdr
.tags
)) {
494 inject_filter_ref
= ipf_get_inject_filter(m
);
496 inject_filter_ref
= NULL
;
500 m
= ip_insertoptions(m
, opt
, &len
);
502 /* Update the chain */
504 if (m0
== packetlist
) {
510 ip
= mtod(m
, struct ip
*);
512 pkt_dst
= ip
->ip_dst
;
515 * We must not send if the packet is destined to network zero.
516 * RFC1122 3.2.1.3 (a) and (b).
518 if (IN_ZERONET(ntohl(pkt_dst
.s_addr
))) {
519 error
= EHOSTUNREACH
;
526 if (!(flags
& (IP_FORWARDING
| IP_RAWOUTPUT
))) {
527 ip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, hlen
>> 2);
529 if (rfc6864
&& IP_OFF_IS_ATOMIC(ip
->ip_off
)) {
530 // Per RFC6864, value of ip_id is undefined for atomic ip packets
533 ip
->ip_id
= ip_randomid();
535 OSAddAtomic(1, &ipstat
.ips_localout
);
537 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
541 /* For debugging, we let the stack forge congestion */
543 ((ip
->ip_tos
& IPTOS_ECN_MASK
) == IPTOS_ECN_ECT1
||
544 (ip
->ip_tos
& IPTOS_ECN_MASK
) == IPTOS_ECN_ECT0
)) {
545 ip
->ip_tos
= (ip
->ip_tos
& ~IPTOS_ECN_MASK
) | IPTOS_ECN_CE
;
550 KERNEL_DEBUG(DBG_LAYER_BEG
, ip
->ip_dst
.s_addr
, ip
->ip_src
.s_addr
,
551 ip
->ip_p
, ip
->ip_off
, ip
->ip_len
);
553 dst
= SIN(&ro
->ro_dst
);
556 * If there is a cached route,
557 * check that it is to the same destination
558 * and is still up. If not, free it and try again.
559 * The address family should also be checked in case of sharing the
563 if (ro
->ro_rt
!= NULL
) {
564 if (ROUTE_UNUSABLE(ro
) && ip
->ip_src
.s_addr
!= INADDR_ANY
&&
565 !(flags
& (IP_ROUTETOIF
| IP_FORWARDING
))) {
566 src_ia
= ifa_foraddr(ip
->ip_src
.s_addr
);
567 if (src_ia
== NULL
) {
568 error
= EADDRNOTAVAIL
;
571 IFA_REMREF(&src_ia
->ia_ifa
);
575 * Test rt_flags without holding rt_lock for performance
576 * reasons; if the route is down it will hopefully be
577 * caught by the layer below (since it uses this route
578 * as a hint) or during the next transmit.
580 if (ROUTE_UNUSABLE(ro
) || dst
->sin_family
!= AF_INET
||
581 dst
->sin_addr
.s_addr
!= pkt_dst
.s_addr
) {
586 * If we're doing source interface selection, we may not
587 * want to use this route; only synch up the generation
590 if (!ipobf
.select_srcif
&& ro
->ro_rt
!= NULL
&&
591 RT_GENID_OUTOFSYNC(ro
->ro_rt
)) {
592 RT_GENID_SYNC(ro
->ro_rt
);
595 if (ro
->ro_rt
== NULL
) {
596 bzero(dst
, sizeof(*dst
));
597 dst
->sin_family
= AF_INET
;
598 dst
->sin_len
= sizeof(*dst
);
599 dst
->sin_addr
= pkt_dst
;
602 * If routing to interface only,
603 * short circuit routing lookup.
605 if (flags
& IP_ROUTETOIF
) {
607 IFA_REMREF(&ia
->ia_ifa
);
609 if ((ia
= ifatoia(ifa_ifwithdstaddr(sintosa(dst
)))) == NULL
) {
610 ia
= ifatoia(ifa_ifwithnet(sintosa(dst
)));
612 OSAddAtomic(1, &ipstat
.ips_noroute
);
614 /* XXX IPv6 APN fallback notification?? */
620 ipobf
.isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
622 * For consistency with other cases below. Loopback
623 * multicast case is handled separately by ip_mloopback().
625 if ((ifp
->if_flags
& IFF_LOOPBACK
) &&
626 !IN_MULTICAST(ntohl(pkt_dst
.s_addr
))) {
627 m
->m_pkthdr
.rcvif
= ifp
;
628 ip_setsrcifaddr_info(m
, ifp
->if_index
, NULL
);
629 ip_setdstifaddr_info(m
, ifp
->if_index
, NULL
);
631 } else if (IN_MULTICAST(ntohl(pkt_dst
.s_addr
)) &&
632 imo
!= NULL
&& (ifp
= imo
->imo_multicast_ifp
) != NULL
) {
634 * Bypass the normal routing lookup for multicast
635 * packets if the interface is specified.
637 ipobf
.isbroadcast
= FALSE
;
639 IFA_REMREF(&ia
->ia_ifa
);
642 /* Macro takes reference on ia */
645 struct ifaddr
*ia0
= NULL
;
646 boolean_t cloneok
= FALSE
;
648 * Perform source interface selection; the source IP address
649 * must belong to one of the addresses of the interface used
650 * by the route. For performance reasons, do this only if
651 * there is no route, or if the routing table has changed,
652 * or if we haven't done source interface selection on this
653 * route (for this PCB instance) before.
655 if (ipobf
.select_srcif
&&
656 ip
->ip_src
.s_addr
!= INADDR_ANY
&& (ROUTE_UNUSABLE(ro
) ||
657 !(ro
->ro_flags
& ROF_SRCIF_SELECTED
))) {
658 /* Find the source interface */
659 ia0
= in_selectsrcif(ip
, ro
, ifscope
);
662 * If the source address belongs to a restricted
663 * interface and the caller forbids our using
664 * interfaces of such type, pretend that there is no
668 IP_CHECK_RESTRICTIONS(ia0
->ifa_ifp
, ipobf
)) {
671 error
= EHOSTUNREACH
;
672 if (flags
& IP_OUTARGS
) {
673 ipoa
->ipoa_retflags
|= IPOARF_IFDENIED
;
679 * If the source address is spoofed (in the case of
680 * IP_RAWOUTPUT on an unbounded socket), or if this
681 * is destined for local/loopback, just let it go out
682 * using the interface of the route. Otherwise,
683 * there's no interface having such an address,
686 if (ia0
== NULL
&& (!(flags
& IP_RAWOUTPUT
) ||
687 ipobf
.srcbound
) && ifscope
!= lo_ifp
->if_index
) {
688 error
= EADDRNOTAVAIL
;
693 * If the caller didn't explicitly specify the scope,
694 * pick it up from the source interface. If the cached
695 * route was wrong and was blown away as part of source
696 * interface selection, don't mask out RTF_PRCLONING
697 * since that route may have been allocated by the ULP,
698 * unless the IP header was created by the caller or
699 * the destination is IPv4 LLA. The check for the
700 * latter is needed because IPv4 LLAs are never scoped
701 * in the current implementation, and we don't want to
702 * replace the resolved IPv4 LLA route with one whose
703 * gateway points to that of the default gateway on
704 * the primary interface of the system.
707 if (ifscope
== IFSCOPE_NONE
) {
708 ifscope
= ia0
->ifa_ifp
->if_index
;
710 cloneok
= (!(flags
& IP_RAWOUTPUT
) &&
711 !(IN_LINKLOCAL(ntohl(ip
->ip_dst
.s_addr
))));
716 * If this is the case, we probably don't want to allocate
717 * a protocol-cloned route since we didn't get one from the
718 * ULP. This lets TCP do its thing, while not burdening
719 * forwarding or ICMP with the overhead of cloning a route.
720 * Of course, we still want to do any cloning requested by
721 * the link layer, as this is probably required in all cases
722 * for correct operation (as it is for ARP).
724 if (ro
->ro_rt
== NULL
) {
725 uint32_t ign
= RTF_PRCLONING
;
727 * We make an exception here: if the destination
728 * address is INADDR_BROADCAST, allocate a protocol-
729 * cloned host route so that we end up with a route
730 * marked with the RTF_BROADCAST flag. Otherwise,
731 * we would end up referring to the default route,
732 * instead of creating a cloned host route entry.
733 * That would introduce inconsistencies between ULPs
734 * that allocate a route and those that don't. The
735 * RTF_BROADCAST route is important since we'd want
736 * to send out undirected IP broadcast packets using
737 * link-level broadcast address. Another exception
738 * is for ULP-created routes that got blown away by
739 * source interface selection (see above).
741 * These exceptions will no longer be necessary when
742 * the RTF_PRCLONING scheme is no longer present.
744 if (cloneok
|| dst
->sin_addr
.s_addr
== INADDR_BROADCAST
) {
745 ign
&= ~RTF_PRCLONING
;
749 * Loosen the route lookup criteria if the ifscope
750 * corresponds to the loopback interface; this is
751 * needed to support Application Layer Gateways
752 * listening on loopback, in conjunction with packet
753 * filter redirection rules. The final source IP
754 * address will be rewritten by the packet filter
755 * prior to the RFC1122 loopback check below.
757 if (ifscope
== lo_ifp
->if_index
) {
758 rtalloc_ign(ro
, ign
);
760 rtalloc_scoped_ign(ro
, ign
, ifscope
);
764 * If the route points to a cellular/expensive interface
765 * and the caller forbids our using interfaces of such type,
766 * pretend that there is no route.
768 if (ro
->ro_rt
!= NULL
) {
769 RT_LOCK_SPIN(ro
->ro_rt
);
770 if (IP_CHECK_RESTRICTIONS(ro
->ro_rt
->rt_ifp
,
772 RT_UNLOCK(ro
->ro_rt
);
774 if (flags
& IP_OUTARGS
) {
775 ipoa
->ipoa_retflags
|=
779 RT_UNLOCK(ro
->ro_rt
);
784 if (ro
->ro_rt
== NULL
) {
785 OSAddAtomic(1, &ipstat
.ips_noroute
);
786 error
= EHOSTUNREACH
;
795 IFA_REMREF(&ia
->ia_ifa
);
797 RT_LOCK_SPIN(ro
->ro_rt
);
798 ia
= ifatoia(ro
->ro_rt
->rt_ifa
);
800 /* Become a regular mutex */
801 RT_CONVERT_LOCK(ro
->ro_rt
);
802 IFA_ADDREF(&ia
->ia_ifa
);
805 * Note: ia_ifp may not be the same as rt_ifp; the latter
806 * is what we use for determining outbound i/f, mtu, etc.
808 ifp
= ro
->ro_rt
->rt_ifp
;
810 if (ro
->ro_rt
->rt_flags
& RTF_GATEWAY
) {
811 dst
= SIN(ro
->ro_rt
->rt_gateway
);
813 if (ro
->ro_rt
->rt_flags
& RTF_HOST
) {
814 /* double negation needed for bool bit field */
816 !!(ro
->ro_rt
->rt_flags
& RTF_BROADCAST
);
818 /* Become a regular mutex */
819 RT_CONVERT_LOCK(ro
->ro_rt
);
820 ipobf
.isbroadcast
= in_broadcast(dst
->sin_addr
, ifp
);
823 * For consistency with IPv6, as well as to ensure that
824 * IP_RECVIF is set correctly for packets that are sent
825 * to one of the local addresses. ia (rt_ifa) would have
826 * been fixed up by rt_setif for local routes. This
827 * would make it appear as if the packet arrives on the
828 * interface which owns the local address. Loopback
829 * multicast case is handled separately by ip_mloopback().
831 if (ia
!= NULL
&& (ifp
->if_flags
& IFF_LOOPBACK
) &&
832 !IN_MULTICAST(ntohl(pkt_dst
.s_addr
))) {
835 m
->m_pkthdr
.rcvif
= ia
->ia_ifa
.ifa_ifp
;
838 srcidx
= ia0
->ifa_ifp
->if_index
;
839 } else if ((ro
->ro_flags
& ROF_SRCIF_SELECTED
) &&
840 ro
->ro_srcia
!= NULL
) {
841 srcidx
= ro
->ro_srcia
->ifa_ifp
->if_index
;
846 ip_setsrcifaddr_info(m
, srcidx
, NULL
);
847 ip_setdstifaddr_info(m
, 0, ia
);
849 RT_UNLOCK(ro
->ro_rt
);
856 if (IN_MULTICAST(ntohl(pkt_dst
.s_addr
))) {
857 struct ifnet
*srcifp
= NULL
;
858 struct in_multi
*inm
;
860 u_int8_t ttl
= IP_DEFAULT_MULTICAST_TTL
;
861 u_int8_t loop
= IP_DEFAULT_MULTICAST_LOOP
;
863 m
->m_flags
|= M_MCAST
;
865 * IP destination address is multicast. Make sure "dst"
866 * still points to the address in "ro". (It may have been
867 * changed to point to a gateway address, above.)
869 dst
= SIN(&ro
->ro_dst
);
871 * See if the caller provided any multicast options
875 vif
= imo
->imo_multicast_vif
;
876 ttl
= imo
->imo_multicast_ttl
;
877 loop
= imo
->imo_multicast_loop
;
878 if (!(flags
& IP_RAWOUTPUT
)) {
881 if (imo
->imo_multicast_ifp
!= NULL
) {
882 ifp
= imo
->imo_multicast_ifp
;
885 } else if (!(flags
& IP_RAWOUTPUT
)) {
890 * Confirm that the outgoing interface supports multicast.
892 if (imo
== NULL
|| vif
== -1) {
893 if (!(ifp
->if_flags
& IFF_MULTICAST
)) {
894 OSAddAtomic(1, &ipstat
.ips_noroute
);
900 * If source address not specified yet, use address
901 * of outgoing interface.
903 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
904 struct in_ifaddr
*ia1
;
905 lck_rw_lock_shared(in_ifaddr_rwlock
);
906 TAILQ_FOREACH(ia1
, &in_ifaddrhead
, ia_link
) {
907 IFA_LOCK_SPIN(&ia1
->ia_ifa
);
908 if (ia1
->ia_ifp
== ifp
) {
909 ip
->ip_src
= IA_SIN(ia1
)->sin_addr
;
911 IFA_UNLOCK(&ia1
->ia_ifa
);
914 IFA_UNLOCK(&ia1
->ia_ifa
);
916 lck_rw_done(in_ifaddr_rwlock
);
917 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
923 in_multihead_lock_shared();
924 IN_LOOKUP_MULTI(&pkt_dst
, ifp
, inm
);
925 in_multihead_lock_done();
926 if (inm
!= NULL
&& (imo
== NULL
|| loop
)) {
928 * If we belong to the destination multicast group
929 * on the outgoing interface, and the caller did not
930 * forbid loopback, loop back a copy.
932 if (!TAILQ_EMPTY(&ipv4_filters
)
934 && !necp_packet_should_skip_filters(m
)
937 struct ipfilter
*filter
;
938 int seen
= (inject_filter_ref
== NULL
);
941 ipf_pktopts
.ippo_flags
|=
943 ipf_pktopts
.ippo_mcast_ifnet
= ifp
;
944 ipf_pktopts
.ippo_mcast_ttl
= ttl
;
945 ipf_pktopts
.ippo_mcast_loop
= loop
;
951 * 4135317 - always pass network byte
954 #if BYTE_ORDER != BIG_ENDIAN
958 TAILQ_FOREACH(filter
, &ipv4_filters
, ipf_link
) {
960 if ((struct ipfilter
*)
961 inject_filter_ref
== filter
) {
964 } else if (filter
->ipf_filter
.
965 ipf_output
!= NULL
) {
967 result
= filter
->ipf_filter
.
971 if (result
== EJUSTRETURN
) {
984 /* set back to host byte order */
985 ip
= mtod(m
, struct ip
*);
986 #if BYTE_ORDER != BIG_ENDIAN
991 ipobf
.didfilter
= TRUE
;
993 ip_mloopback(srcifp
, ifp
, m
, dst
, hlen
);
999 * Multicasts with a time-to-live of zero may be looped-
1000 * back, above, but must not be transmitted on a network.
1001 * Also, multicasts addressed to the loopback interface
1002 * are not sent -- the above call to ip_mloopback() will
1003 * loop back a copy if this host actually belongs to the
1004 * destination group on the loopback interface.
1006 if (ip
->ip_ttl
== 0 || ifp
->if_flags
& IFF_LOOPBACK
) {
1014 * If source address not specified yet, use address
1015 * of outgoing interface.
1017 if (ip
->ip_src
.s_addr
== INADDR_ANY
) {
1018 IFA_LOCK_SPIN(&ia
->ia_ifa
);
1019 ip
->ip_src
= IA_SIN(ia
)->sin_addr
;
1020 IFA_UNLOCK(&ia
->ia_ifa
);
1024 * Look for broadcast address and
1025 * verify user is allowed to send
1028 if (ipobf
.isbroadcast
) {
1029 if (!(ifp
->if_flags
& IFF_BROADCAST
)) {
1030 error
= EADDRNOTAVAIL
;
1033 if (!(flags
& IP_ALLOWBROADCAST
)) {
1037 /* don't allow broadcast messages to be fragmented */
1038 if ((u_short
)ip
->ip_len
> ifp
->if_mtu
) {
1042 m
->m_flags
|= M_BCAST
;
1044 m
->m_flags
&= ~M_BCAST
;
1049 /* Invoke outbound packet filter */
1050 if (PF_IS_ENABLED
) {
1053 m0
= m
; /* Save for later */
1059 args
.fwa_oflags
= flags
;
1060 if (flags
& IP_OUTARGS
) {
1061 args
.fwa_ipoa
= ipoa
;
1063 rc
= pf_af_hook(ifp
, mppn
, &m
, AF_INET
, FALSE
, &args
);
1064 #else /* DUMMYNET */
1065 rc
= pf_af_hook(ifp
, mppn
, &m
, AF_INET
, FALSE
, NULL
);
1066 #endif /* DUMMYNET */
1067 if (rc
!= 0 || m
== NULL
) {
1068 /* Move to the next packet */
1071 /* Skip ahead if first packet in list got dropped */
1072 if (packetlist
== m0
) {
1078 /* Next packet in the chain */
1080 } else if (packetlist
!= NULL
) {
1081 /* No more packet; send down the chain */
1084 /* Nothing left; we're done */
1088 ip
= mtod(m
, struct ip
*);
1089 pkt_dst
= ip
->ip_dst
;
1090 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
1094 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1096 if (IN_LINKLOCAL(ntohl(ip
->ip_src
.s_addr
)) ||
1097 IN_LINKLOCAL(ntohl(ip
->ip_dst
.s_addr
))) {
1098 ip_linklocal_stat
.iplls_out_total
++;
1099 if (ip
->ip_ttl
!= MAXTTL
) {
1100 ip_linklocal_stat
.iplls_out_badttl
++;
1101 ip
->ip_ttl
= MAXTTL
;
1105 if (!ipobf
.didfilter
&&
1106 !TAILQ_EMPTY(&ipv4_filters
)
1108 && !necp_packet_should_skip_filters(m
)
1111 struct ipfilter
*filter
;
1112 int seen
= (inject_filter_ref
== NULL
);
1113 ipf_pktopts
.ippo_flags
&= ~IPPOF_MCAST_OPTS
;
1116 * Check that a TSO frame isn't passed to a filter.
1117 * This could happen if a filter is inserted while
1118 * TCP is sending the TSO packet.
1120 if (m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
) {
1127 /* 4135317 - always pass network byte order to filter */
1128 #if BYTE_ORDER != BIG_ENDIAN
1132 TAILQ_FOREACH(filter
, &ipv4_filters
, ipf_link
) {
1134 if ((struct ipfilter
*)inject_filter_ref
==
1138 } else if (filter
->ipf_filter
.ipf_output
) {
1140 result
= filter
->ipf_filter
.
1141 ipf_output(filter
->ipf_filter
.cookie
,
1142 (mbuf_t
*)&m
, ippo
);
1143 if (result
== EJUSTRETURN
) {
1153 /* set back to host byte order */
1154 ip
= mtod(m
, struct ip
*);
1155 #if BYTE_ORDER != BIG_ENDIAN
1163 /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
1164 necp_matched_policy_id
= necp_ip_output_find_policy_match(m
,
1165 flags
, (flags
& IP_OUTARGS
) ? ipoa
: NULL
, ro
? ro
->ro_rt
: NULL
, &necp_result
, &necp_result_parameter
);
1166 if (necp_matched_policy_id
) {
1167 necp_mark_packet_from_ip(m
, necp_matched_policy_id
);
1168 switch (necp_result
) {
1169 case NECP_KERNEL_POLICY_RESULT_PASS
:
1170 if (necp_result_parameter
.pass_flags
& NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC
) {
1173 /* Check if the interface is allowed */
1174 if (!necp_packet_is_allowed_over_interface(m
, ifp
)) {
1175 error
= EHOSTUNREACH
;
1176 OSAddAtomic(1, &ipstat
.ips_necp_policy_drop
);
1180 case NECP_KERNEL_POLICY_RESULT_DROP
:
1181 case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT
:
1182 /* Flow divert packets should be blocked at the IP layer */
1183 error
= EHOSTUNREACH
;
1184 OSAddAtomic(1, &ipstat
.ips_necp_policy_drop
);
1186 case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL
: {
1187 /* Verify that the packet is being routed to the tunnel */
1188 struct ifnet
*policy_ifp
= necp_get_ifnet_from_result_parameter(&necp_result_parameter
);
1189 if (policy_ifp
== ifp
) {
1190 /* Check if the interface is allowed */
1191 if (!necp_packet_is_allowed_over_interface(m
, ifp
)) {
1192 error
= EHOSTUNREACH
;
1193 OSAddAtomic(1, &ipstat
.ips_necp_policy_drop
);
1198 if (necp_packet_can_rebind_to_ifnet(m
, policy_ifp
, &necp_route
, AF_INET
)) {
1199 /* Check if the interface is allowed */
1200 if (!necp_packet_is_allowed_over_interface(m
, policy_ifp
)) {
1201 error
= EHOSTUNREACH
;
1202 OSAddAtomic(1, &ipstat
.ips_necp_policy_drop
);
1207 * Update the QOS marking policy if
1208 * 1. upper layer asks it to do so
1209 * 2. net_qos_policy_restricted is set (non-zero)
1210 * 3. qos_marking_gencount doesn't match necp_kernel_socket_policies_gencount (checked in necp_lookup_current_qos_marking)
1213 (ipoa
->ipoa_flags
& IPOAF_REDO_QOSMARKING_POLICY
) &&
1214 net_qos_policy_restricted
!= 0) {
1215 bool qos_marking
= (ipoa
->ipoa_flags
& IPOAF_QOSMARKING_ALLOWED
) ? TRUE
: FALSE
;
1216 qos_marking
= necp_lookup_current_qos_marking(&ipoa
->qos_marking_gencount
, NULL
, policy_ifp
, necp_result_parameter
.route_rule_id
, qos_marking
);
1218 ipoa
->ipoa_flags
|= IPOAF_QOSMARKING_ALLOWED
;
1220 ipoa
->ipoa_flags
&= ~IPOAF_QOSMARKING_ALLOWED
;
1224 /* Set ifp to the tunnel interface, since it is compatible with the packet */
1229 error
= ENETUNREACH
;
1230 OSAddAtomic(1, &ipstat
.ips_necp_policy_drop
);
1239 /* Catch-all to check if the interface is allowed */
1240 if (!necp_packet_is_allowed_over_interface(m
, ifp
)) {
1241 error
= EHOSTUNREACH
;
1242 OSAddAtomic(1, &ipstat
.ips_necp_policy_drop
);
1248 if (ipsec_bypass
!= 0 || (flags
& IP_NOIPSEC
)) {
1252 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_START
, 0, 0, 0, 0, 0);
1255 /* get SP for this packet */
1257 sp
= ipsec4_getpolicybysock(m
, IPSEC_DIR_OUTBOUND
,
1260 sp
= ipsec4_getpolicybyaddr(m
, IPSEC_DIR_OUTBOUND
,
1264 IPSEC_STAT_INCREMENT(ipsecstat
.out_inval
);
1265 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1274 switch (sp
->policy
) {
1275 case IPSEC_POLICY_DISCARD
:
1276 case IPSEC_POLICY_GENERATE
:
1278 * This packet is just discarded.
1280 IPSEC_STAT_INCREMENT(ipsecstat
.out_polvio
);
1281 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1285 case IPSEC_POLICY_BYPASS
:
1286 case IPSEC_POLICY_NONE
:
1287 /* no need to do IPsec. */
1288 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1292 case IPSEC_POLICY_IPSEC
:
1293 if (sp
->req
== NULL
) {
1294 /* acquire a policy */
1295 error
= key_spdacquire(sp
);
1296 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1301 /* Verify the redirect to ipsec interface */
1302 if (sp
->ipsec_if
== ifp
) {
1309 case IPSEC_POLICY_ENTRUST
:
1311 printf("ip_output: Invalid policy found. %d\n", sp
->policy
);
1315 if (flags
& IP_ROUTETOIF
) {
1316 bzero(&ipsec_state
.ro
, sizeof(ipsec_state
.ro
));
1318 route_copyout((struct route
*)&ipsec_state
.ro
, ro
, sizeof(struct route
));
1320 ipsec_state
.dst
= SA(dst
);
1326 * delayed checksums are not currently compatible with IPsec
1328 if (m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
1329 in_delayed_cksum(m
);
1332 #if BYTE_ORDER != BIG_ENDIAN
1337 DTRACE_IP6(send
, struct mbuf
*, m
, struct inpcb
*, NULL
,
1338 struct ip
*, ip
, struct ifnet
*, ifp
,
1339 struct ip
*, ip
, struct ip6_hdr
*, NULL
);
1341 error
= ipsec4_output(&ipsec_state
, sp
, flags
);
1342 if (ipsec_state
.tunneled
== 6) {
1348 m0
= m
= ipsec_state
.m
;
1352 * If we're about to use the route in ipsec_state
1353 * and this came from dummynet, cleaup now.
1355 if (ro
== &saved_route
&&
1356 (!(flags
& IP_ROUTETOIF
) || ipsec_state
.tunneled
)) {
1359 #endif /* DUMMYNET */
1361 if (flags
& IP_ROUTETOIF
) {
1363 * if we have tunnel mode SA, we may need to ignore
1366 if (ipsec_state
.tunneled
) {
1367 flags
&= ~IP_ROUTETOIF
;
1368 ro
= (struct route
*)&ipsec_state
.ro
;
1371 ro
= (struct route
*)&ipsec_state
.ro
;
1373 dst
= SIN(ipsec_state
.dst
);
1375 /* mbuf is already reclaimed in ipsec4_output. */
1385 printf("ip4_output (ipsec): error code %d\n", error
);
1388 /* don't show these error codes to the user */
1392 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1398 /* be sure to update variables that are affected by ipsec4_output() */
1399 ip
= mtod(m
, struct ip
*);
1402 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
1403 #else /* !_IP_VHL */
1404 hlen
= ip
->ip_hl
<< 2;
1405 #endif /* !_IP_VHL */
1406 /* Check that there wasn't a route change and src is still valid */
1407 if (ROUTE_UNUSABLE(ro
)) {
1409 VERIFY(src_ia
== NULL
);
1410 if (ip
->ip_src
.s_addr
!= INADDR_ANY
&&
1411 !(flags
& (IP_ROUTETOIF
| IP_FORWARDING
)) &&
1412 (src_ia
= ifa_foraddr(ip
->ip_src
.s_addr
)) == NULL
) {
1413 error
= EADDRNOTAVAIL
;
1414 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1418 if (src_ia
!= NULL
) {
1419 IFA_REMREF(&src_ia
->ia_ifa
);
1424 if (ro
->ro_rt
== NULL
) {
1425 if (!(flags
& IP_ROUTETOIF
)) {
1426 printf("%s: can't update route after "
1427 "IPsec processing\n", __func__
);
1428 error
= EHOSTUNREACH
; /* XXX */
1429 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1435 IFA_REMREF(&ia
->ia_ifa
);
1437 RT_LOCK_SPIN(ro
->ro_rt
);
1438 ia
= ifatoia(ro
->ro_rt
->rt_ifa
);
1440 /* Become a regular mutex */
1441 RT_CONVERT_LOCK(ro
->ro_rt
);
1442 IFA_ADDREF(&ia
->ia_ifa
);
1444 ifp
= ro
->ro_rt
->rt_ifp
;
1445 RT_UNLOCK(ro
->ro_rt
);
1448 /* make it flipped, again. */
1449 #if BYTE_ORDER != BIG_ENDIAN
1453 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT
| DBG_FUNC_END
,
1454 7, 0xff, 0xff, 0xff, 0xff);
1456 /* Pass to filters again */
1457 if (!TAILQ_EMPTY(&ipv4_filters
)
1459 && !necp_packet_should_skip_filters(m
)
1462 struct ipfilter
*filter
;
1464 ipf_pktopts
.ippo_flags
&= ~IPPOF_MCAST_OPTS
;
1467 * Check that a TSO frame isn't passed to a filter.
1468 * This could happen if a filter is inserted while
1469 * TCP is sending the TSO packet.
1471 if (m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
) {
1478 /* 4135317 - always pass network byte order to filter */
1479 #if BYTE_ORDER != BIG_ENDIAN
1483 TAILQ_FOREACH(filter
, &ipv4_filters
, ipf_link
) {
1484 if (filter
->ipf_filter
.ipf_output
) {
1486 result
= filter
->ipf_filter
.
1487 ipf_output(filter
->ipf_filter
.cookie
,
1488 (mbuf_t
*)&m
, ippo
);
1489 if (result
== EJUSTRETURN
) {
1499 /* set back to host byte order */
1500 ip
= mtod(m
, struct ip
*);
1501 #if BYTE_ORDER != BIG_ENDIAN
1511 /* 127/8 must not appear on wire - RFC1122 */
1512 if (!(ifp
->if_flags
& IFF_LOOPBACK
) &&
1513 ((ntohl(ip
->ip_src
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
||
1514 (ntohl(ip
->ip_dst
.s_addr
) >> IN_CLASSA_NSHIFT
) == IN_LOOPBACKNET
)) {
1515 OSAddAtomic(1, &ipstat
.ips_badaddr
);
1516 error
= EADDRNOTAVAIL
;
1521 u_int8_t dscp
= ip
->ip_tos
>> IPTOS_DSCP_SHIFT
;
1523 error
= set_packet_qos(m
, ifp
,
1524 ipoa
->ipoa_flags
& IPOAF_QOSMARKING_ALLOWED
? TRUE
: FALSE
,
1525 ipoa
->ipoa_sotc
, ipoa
->ipoa_netsvctype
, &dscp
);
1527 ip
->ip_tos
&= IPTOS_ECN_MASK
;
1528 ip
->ip_tos
|= dscp
<< IPTOS_DSCP_SHIFT
;
1530 printf("%s if_dscp_for_mbuf() error %d\n", __func__
, error
);
1535 ip_output_checksum(ifp
, m
, (IP_VHL_HL(ip
->ip_vhl
) << 2),
1536 ip
->ip_len
, &sw_csum
);
1538 interface_mtu
= ifp
->if_mtu
;
1540 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp
)) {
1541 interface_mtu
= IN6_LINKMTU(ifp
);
1542 /* Further adjust the size for CLAT46 expansion */
1543 interface_mtu
-= CLAT46_HDR_EXPANSION_OVERHD
;
1547 * If small enough for interface, or the interface will take
1548 * care of the fragmentation for us, can just send directly.
1550 if ((u_short
)ip
->ip_len
<= interface_mtu
|| TSO_IPV4_OK(ifp
, m
) ||
1551 (!(ip
->ip_off
& IP_DF
) && (ifp
->if_hwassist
& CSUM_FRAGMENT
))) {
1552 #if BYTE_ORDER != BIG_ENDIAN
1558 if (sw_csum
& CSUM_DELAY_IP
) {
1559 ip
->ip_sum
= ip_cksum_hdr_out(m
, hlen
);
1560 sw_csum
&= ~CSUM_DELAY_IP
;
1561 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_IP
;
1565 /* clean ipsec history once it goes out of the node */
1566 if (ipsec_bypass
== 0 && !(flags
& IP_NOIPSEC
)) {
1570 if ((m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
) &&
1571 (m
->m_pkthdr
.tso_segsz
> 0)) {
1572 scnt
+= m
->m_pkthdr
.len
/ m
->m_pkthdr
.tso_segsz
;
1577 if (packetchain
== 0) {
1578 if (ro
->ro_rt
!= NULL
&& nstat_collect
) {
1579 nstat_route_tx(ro
->ro_rt
, scnt
,
1580 m
->m_pkthdr
.len
, 0);
1583 error
= dlil_output(ifp
, PF_INET
, m
, ro
->ro_rt
,
1585 if (dlil_verbose
&& error
) {
1586 printf("dlil_output error on interface %s: %d\n",
1587 ifp
->if_xname
, error
);
1593 * packet chaining allows us to reuse the
1594 * route for all packets
1596 bytecnt
+= m
->m_pkthdr
.len
;
1597 mppn
= &m
->m_nextpkt
;
1603 if (pktcnt
> ip_maxchainsent
) {
1604 ip_maxchainsent
= pktcnt
;
1606 if (ro
->ro_rt
!= NULL
&& nstat_collect
) {
1607 nstat_route_tx(ro
->ro_rt
, scnt
,
1611 error
= dlil_output(ifp
, PF_INET
, packetlist
,
1612 ro
->ro_rt
, SA(dst
), 0, adv
);
1613 if (dlil_verbose
&& error
) {
1614 printf("dlil_output error on interface %s: %d\n",
1615 ifp
->if_xname
, error
);
1628 VERIFY(interface_mtu
!= 0);
1630 * Too large for interface; fragment if possible.
1631 * Must be able to put at least 8 bytes per fragment.
1632 * Balk when DF bit is set or the interface didn't support TSO.
1634 if ((ip
->ip_off
& IP_DF
) || pktcnt
> 0 ||
1635 (m
->m_pkthdr
.csum_flags
& CSUM_TSO_IPV4
)) {
1638 * This case can happen if the user changed the MTU
1639 * of an interface after enabling IP on it. Because
1640 * most netifs don't keep track of routes pointing to
1641 * them, there is no way for one to update all its
1642 * routes when the MTU is changed.
1645 RT_LOCK_SPIN(ro
->ro_rt
);
1646 if ((ro
->ro_rt
->rt_flags
& (RTF_UP
| RTF_HOST
)) &&
1647 !(ro
->ro_rt
->rt_rmx
.rmx_locks
& RTV_MTU
) &&
1648 (ro
->ro_rt
->rt_rmx
.rmx_mtu
> interface_mtu
)) {
1649 ro
->ro_rt
->rt_rmx
.rmx_mtu
= interface_mtu
;
1651 RT_UNLOCK(ro
->ro_rt
);
1656 OSAddAtomic(1, &ipstat
.ips_cantfrag
);
1661 * XXX Only TCP seems to be passing a list of packets here.
1662 * The following issue is limited to UDP datagrams with 0 checksum.
1663 * For now limit it to the case when single packet is passed down.
1665 if (packetchain
== 0 && IS_INTF_CLAT46(ifp
)) {
1667 * If it is a UDP packet that has checksum set to 0
1668 * and is also not being offloaded, compute a full checksum
1669 * and update the UDP checksum.
1671 if (ip
->ip_p
== IPPROTO_UDP
&&
1672 !(m
->m_pkthdr
.csum_flags
& (CSUM_UDP
| CSUM_PARTIAL
))) {
1673 struct udphdr
*uh
= NULL
;
1675 if (m
->m_len
< hlen
+ sizeof(struct udphdr
)) {
1676 m
= m_pullup(m
, hlen
+ sizeof(struct udphdr
));
1683 ip
= mtod(m
, struct ip
*);
1686 * Get UDP header and if checksum is 0, then compute the full
1689 uh
= (struct udphdr
*)(void *)((caddr_t
)ip
+ hlen
);
1690 if (uh
->uh_sum
== 0) {
1691 uh
->uh_sum
= inet_cksum(m
, IPPROTO_UDP
, hlen
,
1693 if (uh
->uh_sum
== 0) {
1694 uh
->uh_sum
= 0xffff;
1700 error
= ip_fragment(m
, ifp
, interface_mtu
, sw_csum
);
1706 KERNEL_DEBUG(DBG_LAYER_END
, ip
->ip_dst
.s_addr
,
1707 ip
->ip_src
.s_addr
, ip
->ip_p
, ip
->ip_off
, ip
->ip_len
);
1709 for (m
= m0
; m
; m
= m0
) {
1713 /* clean ipsec history once it goes out of the node */
1714 if (ipsec_bypass
== 0 && !(flags
& IP_NOIPSEC
)) {
1719 if ((packetchain
!= 0) && (pktcnt
> 0)) {
1720 panic("%s: mix of packet in packetlist is "
1721 "wrong=%p", __func__
, packetlist
);
1724 if (ro
->ro_rt
!= NULL
&& nstat_collect
) {
1725 nstat_route_tx(ro
->ro_rt
, 1,
1726 m
->m_pkthdr
.len
, 0);
1728 error
= dlil_output(ifp
, PF_INET
, m
, ro
->ro_rt
,
1730 if (dlil_verbose
&& error
) {
1731 printf("dlil_output error on interface %s: %d\n",
1732 ifp
->if_xname
, error
);
1740 OSAddAtomic(1, &ipstat
.ips_fragmented
);
1745 IFA_REMREF(&ia
->ia_ifa
);
1749 ROUTE_RELEASE(&ipsec_state
.ro
);
1751 KEYDEBUG(KEYDEBUG_IPSEC_STAMP
,
1752 printf("DP ip_output call free SP:%x\n", sp
));
1753 key_freesp(sp
, KEY_SADB_UNLOCKED
);
1757 ROUTE_RELEASE(&necp_route
);
1760 ROUTE_RELEASE(&saved_route
);
1761 #endif /* DUMMYNET */
1763 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT
| DBG_FUNC_END
, error
, 0, 0, 0, 0);
1764 if (ip_output_measure
) {
1765 net_perf_measure_time(&net_perf
, &start_tv
, packets_processed
);
1766 net_perf_histogram(&net_perf
, packets_processed
);
1781 #undef IP_CHECK_RESTRICTIONS
1785 ip_fragment(struct mbuf
*m
, struct ifnet
*ifp
, uint32_t mtu
, int sw_csum
)
1787 struct ip
*ip
, *mhip
;
1788 int len
, hlen
, mhlen
, firstlen
, off
, error
= 0;
1789 struct mbuf
**mnext
= &m
->m_nextpkt
, *m0
;
1792 ip
= mtod(m
, struct ip
*);
1794 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2;
1795 #else /* !_IP_VHL */
1796 hlen
= ip
->ip_hl
<< 2;
1797 #endif /* !_IP_VHL */
1800 * We need to adjust the fragment sizes to account
1801 * for IPv6 fragment header if it needs to be translated
1802 * from IPv4 to IPv6.
1804 if (IS_INTF_CLAT46(ifp
)) {
1805 mtu
-= sizeof(struct ip6_frag
);
1808 firstlen
= len
= (mtu
- hlen
) & ~7;
1815 * if the interface will not calculate checksums on
1816 * fragmented packets, then do it here.
1818 if ((m
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) &&
1819 !(ifp
->if_hwassist
& CSUM_IP_FRAGS
)) {
1820 in_delayed_cksum(m
);
1824 * Loop through length of segment after first fragment,
1825 * make new header and copy data of each part and link onto chain.
1828 mhlen
= sizeof(struct ip
);
1829 for (off
= hlen
+ len
; off
< (u_short
)ip
->ip_len
; off
+= len
) {
1830 MGETHDR(m
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
1833 OSAddAtomic(1, &ipstat
.ips_odropped
);
1836 m
->m_flags
|= (m0
->m_flags
& M_MCAST
) | M_FRAG
;
1837 m
->m_data
+= max_linkhdr
;
1838 mhip
= mtod(m
, struct ip
*);
1840 if (hlen
> sizeof(struct ip
)) {
1841 mhlen
= ip_optcopy(ip
, mhip
) + sizeof(struct ip
);
1842 mhip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, mhlen
>> 2);
1845 mhip
->ip_off
= (u_short
)(((off
- hlen
) >> 3) + (ip
->ip_off
& ~IP_MF
));
1846 if (ip
->ip_off
& IP_MF
) {
1847 mhip
->ip_off
|= IP_MF
;
1849 if (off
+ len
>= (u_short
)ip
->ip_len
) {
1850 len
= (u_short
)ip
->ip_len
- off
;
1852 mhip
->ip_off
|= IP_MF
;
1854 mhip
->ip_len
= htons((u_short
)(len
+ mhlen
));
1855 m
->m_next
= m_copy(m0
, off
, len
);
1856 if (m
->m_next
== NULL
) {
1858 error
= ENOBUFS
; /* ??? */
1859 OSAddAtomic(1, &ipstat
.ips_odropped
);
1862 m
->m_pkthdr
.len
= mhlen
+ len
;
1863 m
->m_pkthdr
.rcvif
= NULL
;
1864 m
->m_pkthdr
.csum_flags
= m0
->m_pkthdr
.csum_flags
;
1866 M_COPY_CLASSIFIER(m
, m0
);
1867 M_COPY_PFTAG(m
, m0
);
1869 #if BYTE_ORDER != BIG_ENDIAN
1870 HTONS(mhip
->ip_off
);
1874 if (sw_csum
& CSUM_DELAY_IP
) {
1875 mhip
->ip_sum
= ip_cksum_hdr_out(m
, mhlen
);
1876 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_IP
;
1879 mnext
= &m
->m_nextpkt
;
1882 OSAddAtomic(nfrags
, &ipstat
.ips_ofragments
);
1884 /* set first/last markers for fragment chain */
1885 m
->m_flags
|= M_LASTFRAG
;
1886 m0
->m_flags
|= M_FIRSTFRAG
| M_FRAG
;
1887 m0
->m_pkthdr
.csum_data
= nfrags
;
1890 * Update first fragment by trimming what's been copied out
1891 * and updating header, then send each fragment (in order).
1894 m_adj(m
, hlen
+ firstlen
- (u_short
)ip
->ip_len
);
1895 m
->m_pkthdr
.len
= hlen
+ firstlen
;
1896 ip
->ip_len
= htons((u_short
)m
->m_pkthdr
.len
);
1897 ip
->ip_off
|= IP_MF
;
1899 #if BYTE_ORDER != BIG_ENDIAN
1904 if (sw_csum
& CSUM_DELAY_IP
) {
1905 ip
->ip_sum
= ip_cksum_hdr_out(m
, hlen
);
1906 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_IP
;
1917 ip_out_cksum_stats(int proto
, u_int32_t len
)
1921 tcp_out_cksum_stats(len
);
1924 udp_out_cksum_stats(len
);
1927 /* keep only TCP or UDP stats for now */
1933 * Process a delayed payload checksum calculation (outbound path.)
1935 * hoff is the number of bytes beyond the mbuf data pointer which
1936 * points to the IP header.
1938 * Returns a bitmask representing all the work done in software.
1941 in_finalize_cksum(struct mbuf
*m
, uint32_t hoff
, uint32_t csum_flags
)
1943 unsigned char buf
[15 << 2] __attribute__((aligned(8)));
1945 uint32_t offset
, _hlen
, mlen
, hlen
, len
, sw_csum
;
1946 uint16_t csum
, ip_len
;
1948 _CASSERT(sizeof(csum
) == sizeof(uint16_t));
1949 VERIFY(m
->m_flags
& M_PKTHDR
);
1951 sw_csum
= (csum_flags
& m
->m_pkthdr
.csum_flags
);
1953 if ((sw_csum
&= (CSUM_DELAY_IP
| CSUM_DELAY_DATA
)) == 0) {
1957 mlen
= m
->m_pkthdr
.len
; /* total mbuf len */
1959 /* sanity check (need at least simple IP header) */
1960 if (mlen
< (hoff
+ sizeof(*ip
))) {
1961 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
1962 "(%u+%u)\n", __func__
, m
, mlen
, hoff
,
1963 (uint32_t)sizeof(*ip
));
1968 * In case the IP header is not contiguous, or not 32-bit aligned,
1969 * or if we're computing the IP header checksum, copy it to a local
1970 * buffer. Copy only the simple IP header here (IP options case
1971 * is handled below.)
1973 if ((sw_csum
& CSUM_DELAY_IP
) || (hoff
+ sizeof(*ip
)) > m
->m_len
||
1974 !IP_HDR_ALIGNED_P(mtod(m
, caddr_t
) + hoff
)) {
1975 m_copydata(m
, hoff
, sizeof(*ip
), (caddr_t
)buf
);
1976 ip
= (struct ip
*)(void *)buf
;
1977 _hlen
= sizeof(*ip
);
1979 ip
= (struct ip
*)(void *)(m
->m_data
+ hoff
);
1983 hlen
= IP_VHL_HL(ip
->ip_vhl
) << 2; /* IP header len */
1986 if (mlen
< (hoff
+ hlen
)) {
1987 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
1988 "hoff %u", __func__
, m
, mlen
, hlen
, hoff
);
1993 * We could be in the context of an IP or interface filter; in the
1994 * former case, ip_len would be in host (correct) order while for
1995 * the latter it would be in network order. Because of this, we
1996 * attempt to interpret the length field by comparing it against
1997 * the actual packet length. If the comparison fails, byte swap
1998 * the length and check again. If it still fails, use the actual
1999 * packet length. This also covers the trailing bytes case.
2001 ip_len
= ip
->ip_len
;
2002 if (ip_len
!= (mlen
- hoff
)) {
2003 ip_len
= OSSwapInt16(ip_len
);
2004 if (ip_len
!= (mlen
- hoff
)) {
2005 printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
2006 "[swapped %d (%x)] doesn't match actual packet "
2007 "length; %d is used instead\n", __func__
,
2008 (uint64_t)VM_KERNEL_ADDRPERM(m
), ip
->ip_p
,
2009 ip
->ip_len
, ip
->ip_len
, ip_len
, ip_len
,
2011 if (mlen
- hoff
> UINT16_MAX
) {
2012 panic("%s: mlen %u - hoff %u > 65535",
2013 __func__
, mlen
, hoff
);
2015 ip_len
= (uint16_t)(mlen
- hoff
);
2019 len
= ip_len
- hlen
; /* csum span */
2021 if (sw_csum
& CSUM_DELAY_DATA
) {
2025 * offset is added to the lower 16-bit value of csum_data,
2026 * which is expected to contain the ULP offset; therefore
2027 * CSUM_PARTIAL offset adjustment must be undone.
2029 if ((m
->m_pkthdr
.csum_flags
& (CSUM_PARTIAL
| CSUM_DATA_VALID
)) ==
2030 (CSUM_PARTIAL
| CSUM_DATA_VALID
)) {
2032 * Get back the original ULP offset (this will
2033 * undo the CSUM_PARTIAL logic in ip_output.)
2035 m
->m_pkthdr
.csum_data
= (m
->m_pkthdr
.csum_tx_stuff
-
2036 m
->m_pkthdr
.csum_tx_start
);
2039 ulpoff
= (m
->m_pkthdr
.csum_data
& 0xffff); /* ULP csum offset */
2040 offset
= hoff
+ hlen
; /* ULP header */
2042 if (mlen
< (ulpoff
+ sizeof(csum
))) {
2043 panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
2044 "cksum offset (%u) cksum flags 0x%x\n", __func__
,
2045 m
, mlen
, ip
->ip_p
, ulpoff
, m
->m_pkthdr
.csum_flags
);
2049 csum
= inet_cksum(m
, 0, offset
, len
);
2052 ip_out_cksum_stats(ip
->ip_p
, len
);
2054 /* RFC1122 4.1.3.4 */
2056 (m
->m_pkthdr
.csum_flags
& (CSUM_UDP
| CSUM_ZERO_INVERT
))) {
2060 /* Insert the checksum in the ULP csum field */
2062 if (offset
+ sizeof(csum
) > m
->m_len
) {
2063 m_copyback(m
, offset
, sizeof(csum
), &csum
);
2064 } else if (IP_HDR_ALIGNED_P(mtod(m
, char *) + hoff
)) {
2065 *(uint16_t *)(void *)(mtod(m
, char *) + offset
) = csum
;
2067 bcopy(&csum
, (mtod(m
, char *) + offset
), sizeof(csum
));
2069 m
->m_pkthdr
.csum_flags
&= ~(CSUM_DELAY_DATA
| CSUM_DATA_VALID
|
2070 CSUM_PARTIAL
| CSUM_ZERO_INVERT
);
2073 if (sw_csum
& CSUM_DELAY_IP
) {
2074 /* IP header must be in the local buffer */
2075 VERIFY(_hlen
== sizeof(*ip
));
2076 if (_hlen
!= hlen
) {
2077 VERIFY(hlen
<= sizeof(buf
));
2078 m_copydata(m
, hoff
, hlen
, (caddr_t
)buf
);
2079 ip
= (struct ip
*)(void *)buf
;
2084 * Compute the IP header checksum as if the IP length
2085 * is the length which we believe is "correct"; see
2086 * how ip_len gets calculated above. Note that this
2087 * is done on the local copy and not on the real one.
2089 ip
->ip_len
= htons(ip_len
);
2091 csum
= in_cksum_hdr_opt(ip
);
2094 ipstat
.ips_snd_swcsum
++;
2095 ipstat
.ips_snd_swcsum_bytes
+= hlen
;
2098 * Insert only the checksum in the existing IP header
2099 * csum field; all other fields are left unchanged.
2101 offset
= hoff
+ offsetof(struct ip
, ip_sum
);
2102 if (offset
+ sizeof(csum
) > m
->m_len
) {
2103 m_copyback(m
, offset
, sizeof(csum
), &csum
);
2104 } else if (IP_HDR_ALIGNED_P(mtod(m
, char *) + hoff
)) {
2105 *(uint16_t *)(void *)(mtod(m
, char *) + offset
) = csum
;
2107 bcopy(&csum
, (mtod(m
, char *) + offset
), sizeof(csum
));
2109 m
->m_pkthdr
.csum_flags
&= ~CSUM_DELAY_IP
;
2117 * Insert IP options into preformed packet.
2118 * Adjust IP destination as required for IP source routing,
2119 * as indicated by a non-zero in_addr at the start of the options.
2121 * XXX This routine assumes that the packet has no options in place.
2123 static struct mbuf
*
2124 ip_insertoptions(struct mbuf
*m
, struct mbuf
*opt
, int *phlen
)
2126 struct ipoption
*p
= mtod(opt
, struct ipoption
*);
2128 struct ip
*ip
= mtod(m
, struct ip
*);
2131 optlen
= opt
->m_len
- sizeof(p
->ipopt_dst
);
2132 if (optlen
+ (u_short
)ip
->ip_len
> IP_MAXPACKET
) {
2133 return m
; /* XXX should fail */
2135 if (p
->ipopt_dst
.s_addr
) {
2136 ip
->ip_dst
= p
->ipopt_dst
;
2138 if (m
->m_flags
& M_EXT
|| m
->m_data
- optlen
< m
->m_pktdat
) {
2139 MGETHDR(n
, M_DONTWAIT
, MT_HEADER
); /* MAC-OK */
2143 n
->m_pkthdr
.rcvif
= 0;
2144 n
->m_pkthdr
.len
= m
->m_pkthdr
.len
+ optlen
;
2145 m
->m_len
-= sizeof(struct ip
);
2146 m
->m_data
+= sizeof(struct ip
);
2149 m
->m_len
= optlen
+ sizeof(struct ip
);
2150 m
->m_data
+= max_linkhdr
;
2151 (void) memcpy(mtod(m
, void *), ip
, sizeof(struct ip
));
2153 m
->m_data
-= optlen
;
2155 m
->m_pkthdr
.len
+= optlen
;
2156 ovbcopy((caddr_t
)ip
, mtod(m
, caddr_t
), sizeof(struct ip
));
2158 ip
= mtod(m
, struct ip
*);
2159 bcopy(p
->ipopt_list
, ip
+ 1, optlen
);
2160 *phlen
= sizeof(struct ip
) + optlen
;
2161 ip
->ip_vhl
= IP_MAKE_VHL(IPVERSION
, *phlen
>> 2);
2162 ip
->ip_len
+= optlen
;
2167 * Copy options from ip to jp,
2168 * omitting those not copied during fragmentation.
2171 ip_optcopy(struct ip
*ip
, struct ip
*jp
)
2174 int opt
, optlen
, cnt
;
2176 cp
= (u_char
*)(ip
+ 1);
2177 dp
= (u_char
*)(jp
+ 1);
2178 cnt
= (IP_VHL_HL(ip
->ip_vhl
) << 2) - sizeof(struct ip
);
2179 for (; cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
2181 if (opt
== IPOPT_EOL
) {
2184 if (opt
== IPOPT_NOP
) {
2185 /* Preserve for IP mcast tunnel's LSRR alignment. */
2191 if (cnt
< IPOPT_OLEN
+ sizeof(*cp
)) {
2192 panic("malformed IPv4 option passed to ip_optcopy");
2196 optlen
= cp
[IPOPT_OLEN
];
2198 if (optlen
< IPOPT_OLEN
+ sizeof(*cp
) || optlen
> cnt
) {
2199 panic("malformed IPv4 option passed to ip_optcopy");
2203 /* bogus lengths should have been caught by ip_dooptions */
2207 if (IPOPT_COPIED(opt
)) {
2208 bcopy(cp
, dp
, optlen
);
2212 for (optlen
= (int)(dp
- (u_char
*)(jp
+ 1)); optlen
& 0x3; optlen
++) {
2219 * IP socket option processing.
2222 ip_ctloutput(struct socket
*so
, struct sockopt
*sopt
)
2224 struct inpcb
*inp
= sotoinpcb(so
);
2226 lck_mtx_t
*mutex_held
= NULL
;
2229 if (sopt
->sopt_level
!= IPPROTO_IP
) {
2233 switch (sopt
->sopt_dir
) {
2235 mutex_held
= socket_getlock(so
, PR_F_WILLUNLOCK
);
2237 * Wait if we are in the middle of ip_output
2238 * as we unlocked the socket there and don't
2239 * want to overwrite the IP options
2241 if (inp
->inp_sndinprog_cnt
> 0) {
2242 inp
->inp_sndingprog_waiters
++;
2244 while (inp
->inp_sndinprog_cnt
> 0) {
2245 msleep(&inp
->inp_sndinprog_cnt
, mutex_held
,
2246 PSOCK
| PCATCH
, "inp_sndinprog_cnt", NULL
);
2248 inp
->inp_sndingprog_waiters
--;
2250 switch (sopt
->sopt_name
) {
2257 if (sopt
->sopt_valsize
> MLEN
) {
2261 MGET(m
, sopt
->sopt_p
!= kernproc
? M_WAIT
: M_DONTWAIT
,
2267 m
->m_len
= (int32_t)sopt
->sopt_valsize
;
2268 error
= sooptcopyin(sopt
, mtod(m
, char *),
2269 m
->m_len
, m
->m_len
);
2275 return ip_pcbopts(sopt
->sopt_name
,
2276 &inp
->inp_options
, m
);
2282 case IP_RECVRETOPTS
:
2283 case IP_RECVDSTADDR
:
2286 case IP_RECVPKTINFO
:
2289 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
2295 switch (sopt
->sopt_name
) {
2297 if (optval
> UINT8_MAX
) {
2301 inp
->inp_ip_tos
= (uint8_t)optval
;
2305 if (optval
> UINT8_MAX
) {
2309 inp
->inp_ip_ttl
= (uint8_t)optval
;
2311 #define OPTSET(bit) do { \
2313 inp->inp_flags |= bit; \
2315 inp->inp_flags &= ~bit; \
2319 #define OPTSET2(bit) do { \
2321 inp->inp_flags2 |= bit; \
2323 inp->inp_flags2 &= ~bit; \
2328 OPTSET(INP_RECVOPTS
);
2331 case IP_RECVRETOPTS
:
2332 OPTSET(INP_RECVRETOPTS
);
2335 case IP_RECVDSTADDR
:
2336 OPTSET(INP_RECVDSTADDR
);
2344 OPTSET(INP_RECVTTL
);
2347 case IP_RECVPKTINFO
:
2348 OPTSET(INP_PKTINFO
);
2352 OPTSET(INP_RECVTOS
);
2356 /* This option is settable only for IPv4 */
2357 if (!(inp
->inp_vflag
& INP_IPV4
)) {
2361 OPTSET2(INP2_DONTFRAG
);
2368 * Multicast socket options are processed by the in_mcast
2371 case IP_MULTICAST_IF
:
2372 case IP_MULTICAST_IFINDEX
:
2373 case IP_MULTICAST_VIF
:
2374 case IP_MULTICAST_TTL
:
2375 case IP_MULTICAST_LOOP
:
2376 case IP_ADD_MEMBERSHIP
:
2377 case IP_DROP_MEMBERSHIP
:
2378 case IP_ADD_SOURCE_MEMBERSHIP
:
2379 case IP_DROP_SOURCE_MEMBERSHIP
:
2380 case IP_BLOCK_SOURCE
:
2381 case IP_UNBLOCK_SOURCE
:
2383 case MCAST_JOIN_GROUP
:
2384 case MCAST_LEAVE_GROUP
:
2385 case MCAST_JOIN_SOURCE_GROUP
:
2386 case MCAST_LEAVE_SOURCE_GROUP
:
2387 case MCAST_BLOCK_SOURCE
:
2388 case MCAST_UNBLOCK_SOURCE
:
2389 error
= inp_setmoptions(inp
, sopt
);
2393 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
2400 case IP_PORTRANGE_DEFAULT
:
2401 inp
->inp_flags
&= ~(INP_LOWPORT
);
2402 inp
->inp_flags
&= ~(INP_HIGHPORT
);
2405 case IP_PORTRANGE_HIGH
:
2406 inp
->inp_flags
&= ~(INP_LOWPORT
);
2407 inp
->inp_flags
|= INP_HIGHPORT
;
2410 case IP_PORTRANGE_LOW
:
2411 inp
->inp_flags
&= ~(INP_HIGHPORT
);
2412 inp
->inp_flags
|= INP_LOWPORT
;
2422 case IP_IPSEC_POLICY
: {
2429 if ((error
= soopt_getm(sopt
, &m
)) != 0) { /* XXX */
2432 if ((error
= soopt_mcopyin(sopt
, m
)) != 0) { /* XXX */
2435 priv
= (proc_suser(sopt
->sopt_p
) == 0);
2437 req
= mtod(m
, caddr_t
);
2440 optname
= sopt
->sopt_name
;
2441 error
= ipsec4_set_policy(inp
, optname
, req
, len
, priv
);
2448 case IP_TRAFFIC_MGT_BACKGROUND
: {
2449 unsigned background
= 0;
2451 error
= sooptcopyin(sopt
, &background
,
2452 sizeof(background
), sizeof(background
));
2458 socket_set_traffic_mgt_flags_locked(so
,
2459 TRAFFIC_MGT_SO_BACKGROUND
);
2461 socket_clear_traffic_mgt_flags_locked(so
,
2462 TRAFFIC_MGT_SO_BACKGROUND
);
2467 #endif /* TRAFFIC_MGT */
2470 * On a multihomed system, scoped routing can be used to
2471 * restrict the source interface used for sending packets.
2472 * The socket option IP_BOUND_IF binds a particular AF_INET
2473 * socket to an interface such that data sent on the socket
2474 * is restricted to that interface. This is unlike the
2475 * SO_DONTROUTE option where the routing table is bypassed;
2476 * therefore it allows for a greater flexibility and control
2477 * over the system behavior, and does not place any restriction
2478 * on the destination address type (e.g. unicast, multicast,
2479 * or broadcast if applicable) or whether or not the host is
2480 * directly reachable. Note that in the multicast transmit
2481 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2482 * IP_BOUND_IF, since the former practically bypasses the
2483 * routing table; in this case, IP_BOUND_IF sets the default
2484 * interface used for sending multicast packets in the absence
2485 * of an explicit multicast transmit interface.
2488 /* This option is settable only for IPv4 */
2489 if (!(inp
->inp_vflag
& INP_IPV4
)) {
2494 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
2501 error
= inp_bindif(inp
, optval
, NULL
);
2504 case IP_NO_IFT_CELLULAR
:
2505 /* This option is settable only for IPv4 */
2506 if (!(inp
->inp_vflag
& INP_IPV4
)) {
2511 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
2518 /* once set, it cannot be unset */
2519 if (!optval
&& INP_NO_CELLULAR(inp
)) {
2524 error
= so_set_restrictions(so
,
2525 SO_RESTRICT_DENY_CELLULAR
);
2529 /* This option is not settable */
2534 error
= ENOPROTOOPT
;
2540 switch (sopt
->sopt_name
) {
2543 if (inp
->inp_options
) {
2544 error
= sooptcopyout(sopt
,
2545 mtod(inp
->inp_options
, char *),
2546 inp
->inp_options
->m_len
);
2548 sopt
->sopt_valsize
= 0;
2555 case IP_RECVRETOPTS
:
2556 case IP_RECVDSTADDR
:
2560 case IP_RECVPKTINFO
:
2563 switch (sopt
->sopt_name
) {
2565 optval
= inp
->inp_ip_tos
;
2569 optval
= inp
->inp_ip_ttl
;
2572 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
2573 #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0)
2575 optval
= OPTBIT(INP_RECVOPTS
);
2578 case IP_RECVRETOPTS
:
2579 optval
= OPTBIT(INP_RECVRETOPTS
);
2582 case IP_RECVDSTADDR
:
2583 optval
= OPTBIT(INP_RECVDSTADDR
);
2587 optval
= OPTBIT(INP_RECVIF
);
2591 optval
= OPTBIT(INP_RECVTTL
);
2595 if (inp
->inp_flags
& INP_HIGHPORT
) {
2596 optval
= IP_PORTRANGE_HIGH
;
2597 } else if (inp
->inp_flags
& INP_LOWPORT
) {
2598 optval
= IP_PORTRANGE_LOW
;
2604 case IP_RECVPKTINFO
:
2605 optval
= OPTBIT(INP_PKTINFO
);
2609 optval
= OPTBIT(INP_RECVTOS
);
2612 optval
= OPTBIT2(INP2_DONTFRAG
);
2615 error
= sooptcopyout(sopt
, &optval
, sizeof(optval
));
2618 case IP_MULTICAST_IF
:
2619 case IP_MULTICAST_IFINDEX
:
2620 case IP_MULTICAST_VIF
:
2621 case IP_MULTICAST_TTL
:
2622 case IP_MULTICAST_LOOP
:
2624 error
= inp_getmoptions(inp
, sopt
);
2628 case IP_IPSEC_POLICY
: {
2629 error
= 0; /* This option is no longer supported */
2635 case IP_TRAFFIC_MGT_BACKGROUND
: {
2636 unsigned background
= (so
->so_flags1
&
2637 SOF1_TRAFFIC_MGT_SO_BACKGROUND
) ? 1 : 0;
2638 return sooptcopyout(sopt
, &background
,
2639 sizeof(background
));
2641 #endif /* TRAFFIC_MGT */
2644 if (inp
->inp_flags
& INP_BOUND_IF
) {
2645 optval
= inp
->inp_boundifp
->if_index
;
2647 error
= sooptcopyout(sopt
, &optval
, sizeof(optval
));
2650 case IP_NO_IFT_CELLULAR
:
2651 optval
= INP_NO_CELLULAR(inp
) ? 1 : 0;
2652 error
= sooptcopyout(sopt
, &optval
, sizeof(optval
));
2656 optval
= (inp
->inp_last_outifp
!= NULL
) ?
2657 inp
->inp_last_outifp
->if_index
: 0;
2658 error
= sooptcopyout(sopt
, &optval
, sizeof(optval
));
2662 error
= ENOPROTOOPT
;
2671 * Set up IP options in pcb for insertion in output packets.
2672 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2673 * with destination address if source routed.
/*
 * ip_pcbopts: install the IP options carried in mbuf `m` as the option
 * chain referenced by *pcbopt.
 *
 * Steps visible in this listing:
 *   - the mbuf previously referenced by *pcbopt is released with m_free();
 *   - a NULL or zero-length `m` is the "only turn options off" case;
 *   - m->m_len must be a multiple of sizeof(int32_t), and one extra
 *     struct in_addr must still fit inside m->m_dat[MLEN];
 *   - the option bytes are moved up by sizeof(struct in_addr) and the
 *     vacated leading slot is zeroed, reserving it for the first-hop
 *     address;
 *   - every option is walked and its length validated; in the routed
 *     case described in the comment further down, the first hop is
 *     moved into the reserved slot and the option shrinks by one address;
 *   - the resulting length is checked against
 *     MAX_IPOPTLEN + sizeof(struct in_addr).
 *
 * optname is unused (see the #pragma below).
 *
 * NOTE(review): this listing omits short lines -- local declarations,
 * the switch on the option type, the error/cleanup path and the return
 * statements are not visible here; confirm those against the full file.
 */
2676 ip_pcbopts(int optname
, struct mbuf
**pcbopt
, struct mbuf
*m
)
2678 #pragma unused(optname)
2683 /* turn off any old options */
2685 (void) m_free(*pcbopt
);
2688 if (m
== (struct mbuf
*)0 || m
->m_len
== 0) {
2690 * Only turning off any previous options.
/* The option bytes must fill whole 32-bit words. */
2698 if (m
->m_len
% sizeof(int32_t)) {
2703 * IP first-hop destination address will be stored before
2704 * actual options; move other options back
2705 * and clear it when none present.
/* Bounds check: data + one extra in_addr must end before m_dat[MLEN]. */
2707 if (m
->m_data
+ m
->m_len
+ sizeof(struct in_addr
) >= &m
->m_dat
[MLEN
]) {
/* Account for the reserved slot, then shift the options behind it. */
2711 m
->m_len
+= sizeof(struct in_addr
);
2712 cp
= mtod(m
, u_char
*) + sizeof(struct in_addr
);
2713 ovbcopy(mtod(m
, caddr_t
), (caddr_t
)cp
, (unsigned)cnt
);
2714 bzero(mtod(m
, caddr_t
), sizeof(struct in_addr
));
/* Walk the options; cnt is the number of bytes left, optlen the current option size. */
2716 for (; cnt
> 0; cnt
-= optlen
, cp
+= optlen
) {
2717 opt
= cp
[IPOPT_OPTVAL
];
2718 if (opt
== IPOPT_EOL
) {
2721 if (opt
== IPOPT_NOP
) {
/* The length byte at index IPOPT_OLEN must lie inside the remaining bytes. */
2724 if (cnt
< IPOPT_OLEN
+ sizeof(*cp
)) {
2727 optlen
= cp
[IPOPT_OLEN
];
/* The declared length must cover the length byte and not exceed what is left. */
2728 if (optlen
< IPOPT_OLEN
+ sizeof(*cp
) || optlen
> cnt
) {
2739 * user process specifies route as:
2741 * D must be our final destination (but we can't
2742 * check that since we may not have connected yet).
2743 * A is first hop destination, which doesn't appear in
2744 * actual IP option, but is stored before the options.
/* The option must be long enough to contain at least one address. */
2746 if (optlen
< IPOPT_MINOFF
- 1 + sizeof(struct in_addr
)) {
/* Guards the (uint8_t) narrowing of optlen a few lines below. */
2749 if (optlen
> UINT8_MAX
) {
/* One address leaves the option: shrink the mbuf, the byte count and the option length. */
2752 m
->m_len
-= sizeof(struct in_addr
);
2753 cnt
-= sizeof(struct in_addr
);
2754 optlen
-= sizeof(struct in_addr
);
2755 cp
[IPOPT_OLEN
] = (uint8_t)optlen
;
2757 * Move first hop before start of options.
2759 bcopy((caddr_t
)&cp
[IPOPT_OFFSET
+ 1], mtod(m
, caddr_t
),
2760 sizeof(struct in_addr
));
2762 * Then copy rest of options back
2763 * to close up the deleted entry.
2765 ovbcopy((caddr_t
)(&cp
[IPOPT_OFFSET
+ 1] +
2766 sizeof(struct in_addr
)),
2767 (caddr_t
)&cp
[IPOPT_OFFSET
+ 1],
2768 (unsigned)cnt
- (IPOPT_MINOFF
- 1));
/* Final size limit: options plus the reserved first-hop slot. */
2772 if (m
->m_len
> MAX_IPOPTLEN
+ sizeof(struct in_addr
)) {
2784 ip_moptions_init(void)
2786 PE_parse_boot_argn("ifa_debug", &imo_debug
, sizeof(imo_debug
));
2788 vm_size_t imo_size
= (imo_debug
== 0) ? sizeof(struct ip_moptions
) :
2789 sizeof(struct ip_moptions_dbg
);
2791 imo_zone
= zone_create(IMO_ZONE_NAME
, imo_size
, ZC_ZFREE_CLEARMEM
);
/*
 * imo_addref: take one reference on a multicast options structure.
 *
 *   imo     structure whose imo_refcnt is incremented
 *   locked  presumably non-zero when the caller already holds the imo
 *           lock; the IMO_LOCK_ASSERT_HELD() below belongs to that
 *           handling.  NOTE(review): the lock/unlock statements are on
 *           lines this listing omits -- confirm against the full file.
 *
 * The counter is pre-incremented; a result of 0 means it wrapped around
 * and is treated as fatal.  Otherwise, when a trace callback is
 * installed in imo_trace, it is invoked with TRUE to record the hold.
 */
2795 imo_addref(struct ip_moptions
*imo
, int locked
)
2800 IMO_LOCK_ASSERT_HELD(imo
);
/* Incrementing to 0 can only happen on counter wraparound. */
2803 if (++imo
->imo_refcnt
== 0) {
2804 panic("%s: imo %p wraparound refcnt\n", __func__
, imo
);
2806 } else if (imo
->imo_trace
!= NULL
) {
2807 (*imo
->imo_trace
)(imo
, TRUE
);
/*
 * imo_remref: drop one reference on a multicast options structure and
 * destroy the structure once it is no longer referenced.
 *
 * Sequence visible in this listing:
 *   - a reference count that is already 0 is an underflow and panics;
 *     otherwise the optional imo_trace callback is invoked with FALSE
 *     to record the release;
 *   - the count is then compared against 0 (imo_refcnt > 0);
 *   - for each of the imo_num_memberships entries the group is left
 *     through in_leavegroup(), passing the matching entry of
 *     imo_mfilters (or NULL when there is no filter array), the
 *     membership reference is dropped with INM_REMREF() and the slot is
 *     cleared;
 *   - the imo_mfilters and imo_membership arrays are freed and their
 *     pointers reset to NULL;
 *   - the lock is destroyed, and the object is returned to imo_zone
 *     after checking that it carries IFD_ALLOC.
 *
 * NOTE(review): the decrement of imo_refcnt, the early return taken
 * while references remain, and the locking around these steps are on
 * lines this listing omits -- confirm against the full file.
 */
2816 imo_remref(struct ip_moptions
*imo
)
2821 if (imo
->imo_refcnt
== 0) {
2822 panic("%s: imo %p negative refcnt", __func__
, imo
);
2824 } else if (imo
->imo_trace
!= NULL
) {
2825 (*imo
->imo_trace
)(imo
, FALSE
);
2829 if (imo
->imo_refcnt
> 0) {
/* Tear down every group membership still recorded in the structure. */
2834 for (i
= 0; i
< imo
->imo_num_memberships
; ++i
) {
2835 struct in_mfilter
*imf
;
/* Filters are optional: use the i-th one only when the array exists. */
2837 imf
= imo
->imo_mfilters
? &imo
->imo_mfilters
[i
] : NULL
;
2842 (void) in_leavegroup(imo
->imo_membership
[i
], imf
);
2848 INM_REMREF(imo
->imo_membership
[i
]);
2849 imo
->imo_membership
[i
] = NULL
;
2851 imo
->imo_num_memberships
= 0;
2852 if (imo
->imo_mfilters
!= NULL
) {
2853 FREE(imo
->imo_mfilters
, M_INMFILTER
);
2854 imo
->imo_mfilters
= NULL
;
2856 if (imo
->imo_membership
!= NULL
) {
2857 FREE(imo
->imo_membership
, M_IPMOPTS
);
2858 imo
->imo_membership
= NULL
;
2862 lck_mtx_destroy(&imo
->imo_lock
, ifa_mtx_grp
);
/* Only objects flagged IFD_ALLOC (set in ip_allocmoptions) may go back to the zone. */
2864 if (!(imo
->imo_debug
& IFD_ALLOC
)) {
2865 panic("%s: imo %p cannot be freed", __func__
, imo
);
2868 zfree(imo_zone
, imo
);
/*
 * imo_trace: record a call trace for a reference hold or release on a
 * debug-enabled multicast options structure.
 *
 *   imo      object being traced; it is reinterpreted as a
 *            struct ip_moptions_dbg, so it must carry IFD_DEBUG
 *            (otherwise the function panics)
 *   refhold  selects which counter/history pair is used: the
 *            imo_refhold* fields or the imo_refrele* fields
 *
 * The selected counter is bumped with atomic_add_16_ov() and the value
 * it returns, taken modulo IMO_TRACE_HIST_SIZE, picks the history slot
 * handed to ctrace_record(), so the history wraps around.
 *
 * NOTE(review): the local declarations and the if/else on refhold are
 * on lines this listing omits.
 */
2872 imo_trace(struct ip_moptions
*imo
, int refhold
)
2874 struct ip_moptions_dbg
*imo_dbg
= (struct ip_moptions_dbg
*)imo
;
2879 if (!(imo
->imo_debug
& IFD_DEBUG
)) {
2880 panic("%s: imo %p has no debug structure", __func__
, imo
);
/* Hold side: counter and history used for references taken. */
2884 cnt
= &imo_dbg
->imo_refhold_cnt
;
2885 tr
= imo_dbg
->imo_refhold
;
/* Release side: counter and history used for references dropped. */
2887 cnt
= &imo_dbg
->imo_refrele_cnt
;
2888 tr
= imo_dbg
->imo_refrele
;
2891 idx
= atomic_add_16_ov(cnt
, 1) % IMO_TRACE_HIST_SIZE
;
2892 ctrace_record(&tr
[idx
]);
/*
 * ip_allocmoptions: allocate a zero-filled struct ip_moptions from
 * imo_zone.
 *
 *   how  allocation flags, passed to zalloc_flags() together with Z_ZERO
 *
 * The new object gets its imo_lock initialised and is flagged IFD_ALLOC.
 * When the global imo_debug is non-zero it is additionally flagged
 * IFD_DEBUG and imo_trace is installed as its trace callback.
 *
 * NOTE(review): the check of the allocation result and the return
 * statement are on lines this listing omits.
 */
2895 struct ip_moptions
*
2896 ip_allocmoptions(zalloc_flags_t how
)
2898 struct ip_moptions
*imo
;
2900 imo
= zalloc_flags(imo_zone
, how
| Z_ZERO
);
2902 lck_mtx_init(&imo
->imo_lock
, ifa_mtx_grp
, ifa_mtx_attr
);
2903 imo
->imo_debug
|= IFD_ALLOC
;
/* Debug builds of the object also record hold/release traces. */
2904 if (imo_debug
!= 0) {
2905 imo
->imo_debug
|= IFD_DEBUG
;
2906 imo
->imo_trace
= imo_trace
;
2915 * Routine called from ip_output() to loop back a copy of an IP multicast
2916 * packet to the input queue of a specified interface. Note that this
2917 * calls the output routine of the loopback "driver", but with an interface
2918 * pointer that might NOT be a loopback interface -- evil, but easier than
2919 * replicating that code here.
/*
 * ip_mloopback: pass a copy of an outgoing multicast packet to the
 * loopback interface (lo_ifp) via dlil_output().
 *
 *   srcifp   interface owning the source address; when NULL it is
 *            looked up below from ip->ip_src in the in_ifaddr hash
 *   origifp  interface stored as rcvif in the copy and reported through
 *            ip_setdstifaddr_info()
 *   m        packet to be copied
 *   dst      destination address handed to dlil_output()
 *   hlen     IP header length; the copy is pulled up so that at least
 *            this many bytes are contiguous when required
 *
 * Steps visible in this listing: copy the packet, recompute the IP
 * header checksum on the copy, adjust the checksum flags of the copy,
 * record the receive/source/destination interfaces, then output on the
 * loopback interface.
 *
 * NOTE(review): byte-order conversions inside the BYTE_ORDER blocks,
 * the condition selecting between the two checksum-flag branches, and
 * the early returns are on lines this listing omits.
 */
2922 ip_mloopback(struct ifnet
*srcifp
, struct ifnet
*origifp
, struct mbuf
*m
,
2923 struct sockaddr_in
*dst
, int hlen
)
2928 if (lo_ifp
== NULL
) {
2933 * Copy the packet header as it's needed for the checksum
2934 * Make sure to deep-copy IP header portion in case the data
2935 * is in an mbuf cluster, so that we can safely override the IP
2936 * header portion later.
2938 copym
= m_copym_mode(m
, 0, M_COPYALL
, M_DONTWAIT
, M_COPYM_COPY_HDR
);
/* Pull up when the copy uses external storage (M_EXT) or its first mbuf is shorter than hlen. */
2939 if (copym
!= NULL
&& ((copym
->m_flags
& M_EXT
) || copym
->m_len
< hlen
)) {
2940 copym
= m_pullup(copym
, hlen
);
2943 if (copym
== NULL
) {
2948 * We don't bother to fragment if the IP length is greater
2949 * than the interface's MTU. Can this possibly matter?
2951 ip
= mtod(copym
, struct ip
*);
2952 #if BYTE_ORDER != BIG_ENDIAN
2957 ip
->ip_sum
= ip_cksum_hdr_out(copym
, hlen
);
2960 * Mark checksum as valid unless receive checksum offload is
2961 * disabled; if so, compute checksum in software. If the
2962 * interface itself is lo0, this will be overridden by if_loop.
/* Branch 1: drop the partial/zero-invert bits and mark the payload checksum as already valid. */
2965 copym
->m_pkthdr
.csum_flags
&= ~(CSUM_PARTIAL
| CSUM_ZERO_INVERT
);
2966 copym
->m_pkthdr
.csum_flags
|=
2967 CSUM_DATA_VALID
| CSUM_PSEUDO_HDR
;
2968 copym
->m_pkthdr
.csum_data
= 0xffff;
/* Branch 2: a delayed payload checksum is still pending, so compute it now. */
2969 } else if (copym
->m_pkthdr
.csum_flags
& CSUM_DELAY_DATA
) {
2970 #if BYTE_ORDER != BIG_ENDIAN
2973 in_delayed_cksum(copym
);
2974 #if BYTE_ORDER != BIG_ENDIAN
2980 * Stuff the 'real' ifp into the pkthdr, to be used in matching
2981 * in ip_input(); we need the loopback ifp/dl_tag passed as args
2982 * to make the loopback driver compliant with the data link
2985 copym
->m_pkthdr
.rcvif
= origifp
;
2988 * Also record the source interface (which owns the source address).
2989 * This is basically a stripped down version of ifa_foraddr().
2991 if (srcifp
== NULL
) {
2992 struct in_ifaddr
*ia
;
/* Scan the hash bucket for ip_src under the shared in_ifaddr lock. */
2994 lck_rw_lock_shared(in_ifaddr_rwlock
);
2995 TAILQ_FOREACH(ia
, INADDR_HASH(ip
->ip_src
.s_addr
), ia_hash
) {
2996 IFA_LOCK_SPIN(&ia
->ia_ifa
);
2997 if (IA_SIN(ia
)->sin_addr
.s_addr
== ip
->ip_src
.s_addr
) {
2998 srcifp
= ia
->ia_ifp
;
2999 IFA_UNLOCK(&ia
->ia_ifa
);
3002 IFA_UNLOCK(&ia
->ia_ifa
);
3004 lck_rw_done(in_ifaddr_rwlock
);
3006 if (srcifp
!= NULL
) {
3007 ip_setsrcifaddr_info(copym
, srcifp
->if_index
, NULL
);
3009 ip_setdstifaddr_info(copym
, origifp
->if_index
, NULL
);
3011 dlil_output(lo_ifp
, PF_INET
, copym
, NULL
, SA(dst
), 0, NULL
);
3015 * Given a source IP address (and route, if available), determine the best
3016 * interface to send the packet from. Checking for (and updating) the
3017 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3018 * without any locks based on the assumption that ip_output() is single-
3019 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3020 * performing output at the IP layer.
3022 * This routine is analogous to in6_selectroute() for IPv6.
/*
 * in_selectsrcif: choose the interface address (struct ifaddr) that
 * goes with the source address of an outgoing packet.
 *
 *   ip       IP header; ip_src must not be INADDR_ANY (VERIFY below)
 *   ro       route placeholder; its cached route (ro_rt) is consulted
 *            and, on success, ro_flags gets ROF_SRCIF_SELECTED
 *   ifscope  interface scope requested by the caller, or IFSCOPE_NONE
 *
 * Outline of the logic visible in this listing:
 *   1. Scoped lookup -- when a scope is given or a route is cached,
 *      look the source address up with ifa_foraddr_scoped().  With no
 *      scope, the scope comes from the route interface, or from the
 *      primary interface when the route is unusable.  If that fails,
 *      forwarding is enabled and the packet is neither TCP nor UDP, an
 *      unscoped ifa_foraddr() is tried and ifscope is demoted to
 *      IFSCOPE_NONE.
 *   2. Slow path -- with no result and no scope, search all interfaces
 *      with ifa_foraddr(); without a cached route the choice is
 *      cross-checked through rt_lookup() under rnh_lock.
 *   3. A cached route is reported as cleared when no ifaddr was found,
 *      when its interface differs from the one chosen (loopback
 *      excepted), or when it is not RTF_UP.
 *   4. A link-local destination combined with a non-link-local source
 *      gets special treatment (see the comment at that test).
 *   5. When a route remains and, for link-local destinations, its
 *      gateway is a resolved AF_LINK address, a reference is taken for
 *      the route, the previous ro_srcia is released, the
 *      ROF_SRCIF_SELECTED flag is set and RT_GENID_SYNC() is applied.
 *
 * The s_src/s_dst strings are only filled in, and only printed, when
 * ip_select_srcif_debug is set.
 *
 * NOTE(review): this listing omits short lines; reference drops,
 * assignments such as the sin address setup, the else branches of the
 * debug prints and every return statement are not visible here and
 * must be confirmed against the full file.
 */
3024 static struct ifaddr
*
3025 in_selectsrcif(struct ip
*ip
, struct route
*ro
, unsigned int ifscope
)
3027 struct ifaddr
*ifa
= NULL
;
3028 struct in_addr src
= ip
->ip_src
;
3029 struct in_addr dst
= ip
->ip_dst
;
3030 struct ifnet
*rt_ifp
;
3031 char s_src
[MAX_IPv4_STR_LEN
], s_dst
[MAX_IPv4_STR_LEN
];
3033 VERIFY(src
.s_addr
!= INADDR_ANY
);
3035 if (ip_select_srcif_debug
) {
3036 (void) inet_ntop(AF_INET
, &src
.s_addr
, s_src
, sizeof(s_src
));
3037 (void) inet_ntop(AF_INET
, &dst
.s_addr
, s_dst
, sizeof(s_dst
));
3040 if (ro
->ro_rt
!= NULL
) {
/* Interface of the cached route, or NULL when there is no route. */
3044 rt_ifp
= (ro
->ro_rt
!= NULL
) ? ro
->ro_rt
->rt_ifp
: NULL
;
3047 * Given the source IP address, find a suitable source interface
3048 * to use for transmission; if the caller has specified a scope,
3049 * optimize the search by looking at the addresses only for that
3050 * interface. This is still suboptimal, however, as we need to
3051 * traverse the per-interface list.
3053 if (ifscope
!= IFSCOPE_NONE
|| ro
->ro_rt
!= NULL
) {
3054 unsigned int scope
= ifscope
;
3057 * If no scope is specified and the route is stale (pointing
3058 * to a defunct interface) use the current primary interface;
3059 * this happens when switching between interfaces configured
3060 * with the same IP address. Otherwise pick up the scope
3061 * information from the route; the ULP may have looked up a
3062 * correct route and we just need to verify it here and mark
3063 * it with the ROF_SRCIF_SELECTED flag below.
/* Reaching here with no scope implies ro_rt != NULL, hence rt_ifp is set. */
3065 if (scope
== IFSCOPE_NONE
) {
3066 scope
= rt_ifp
->if_index
;
3067 if (scope
!= get_primary_ifscope(AF_INET
) &&
3068 ROUTE_UNUSABLE(ro
)) {
3069 scope
= get_primary_ifscope(AF_INET
);
3073 ifa
= (struct ifaddr
*)ifa_foraddr_scoped(src
.s_addr
, scope
);
3075 if (ifa
== NULL
&& ip
->ip_p
!= IPPROTO_UDP
&&
3076 ip
->ip_p
!= IPPROTO_TCP
&& ipforwarding
) {
3078 * If forwarding is enabled, and if the packet isn't
3079 * TCP or UDP, check if the source address belongs
3080 * to one of our own interfaces; if so, demote the
3081 * interface scope and do a route lookup right below.
3083 ifa
= (struct ifaddr
*)ifa_foraddr(src
.s_addr
);
3087 ifscope
= IFSCOPE_NONE
;
3091 if (ip_select_srcif_debug
&& ifa
!= NULL
) {
3092 if (ro
->ro_rt
!= NULL
) {
3093 printf("%s->%s ifscope %d->%d ifa_if %s "
3094 "ro_if %s\n", s_src
, s_dst
, ifscope
,
3095 scope
, if_name(ifa
->ifa_ifp
),
3098 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3099 s_src
, s_dst
, ifscope
, scope
,
3100 if_name(ifa
->ifa_ifp
));
3106 * Slow path; search for an interface having the corresponding source
3107 * IP address if the scope was not specified by the caller, and:
3109 * 1) There currently isn't any route, or,
3110 * 2) The interface used by the route does not own that source
3111 * IP address; in this case, the route will get blown away
3112 * and we'll do a more specific scoped search using the newly
3115 if (ifa
== NULL
&& ifscope
== IFSCOPE_NONE
) {
3116 ifa
= (struct ifaddr
*)ifa_foraddr(src
.s_addr
);
3119 * If we have the IP address, but not the route, we don't
3120 * really know whether or not it belongs to the correct
3121 * interface (it could be shared across multiple interfaces.)
3122 * The only way to find out is to do a route lookup.
3124 if (ifa
!= NULL
&& ro
->ro_rt
== NULL
) {
3126 struct sockaddr_in sin
;
3127 struct ifaddr
*oifa
= NULL
;
3129 bzero(&sin
, sizeof(sin
));
3130 sin
.sin_family
= AF_INET
;
3131 sin
.sin_len
= sizeof(sin
);
/* The routing table lookup is done with rnh_lock held. */
3134 lck_mtx_lock(rnh_lock
);
3135 if ((rt
= rt_lookup(TRUE
, SA(&sin
), NULL
,
3136 rt_tables
[AF_INET
], IFSCOPE_NONE
)) != NULL
) {
3139 * If the route uses a different interface,
3140 * use that one instead. The IP address of
3141 * the ifaddr that we pick up here is not
3144 if (ifa
->ifa_ifp
!= rt
->rt_ifp
) {
3154 lck_mtx_unlock(rnh_lock
);
3157 struct ifaddr
*iifa
;
3160 * See if the interface pointed to by the
3161 * route is configured with the source IP
3162 * address of the packet.
3164 iifa
= (struct ifaddr
*)ifa_foraddr_scoped(
3165 src
.s_addr
, ifa
->ifa_ifp
->if_index
);
3169 * Found it; drop the original one
3170 * as well as the route interface
3171 * address, and use this instead.
3176 } else if (!ipforwarding
||
3177 (rt
->rt_flags
& RTF_GATEWAY
)) {
3179 * This interface doesn't have that
3180 * source IP address; drop the route
3181 * interface address and just use the
3182 * original one, and let the caller
3183 * do a scoped route lookup.
3189 * Forwarding is enabled and the source
3190 * address belongs to one of our own
3191 * interfaces which isn't the outgoing
3192 * interface, and we have a route, and
3193 * the destination is on a network that
3194 * is directly attached (onlink); drop
3195 * the original one and use the route
3196 * interface address instead.
3201 } else if (ifa
!= NULL
&& ro
->ro_rt
!= NULL
&&
3202 !(ro
->ro_rt
->rt_flags
& RTF_GATEWAY
) &&
3203 ifa
->ifa_ifp
!= ro
->ro_rt
->rt_ifp
&& ipforwarding
) {
3205 * Forwarding is enabled and the source address belongs
3206 * to one of our own interfaces which isn't the same
3207 * as the interface used by the known route; drop the
3208 * original one and use the route interface address.
3211 ifa
= ro
->ro_rt
->rt_ifa
;
3215 if (ip_select_srcif_debug
&& ifa
!= NULL
) {
3216 printf("%s->%s ifscope %d ifa_if %s\n",
3217 s_src
, s_dst
, ifscope
, if_name(ifa
->ifa_ifp
));
3221 if (ro
->ro_rt
!= NULL
) {
3222 RT_LOCK_ASSERT_HELD(ro
->ro_rt
);
3225 * If there is a non-loopback route with the wrong interface, or if
3226 * there is no interface configured with such an address, blow it
3227 * away. Except for local/loopback, we look for one with a matching
3228 * interface scope/index.
3230 if (ro
->ro_rt
!= NULL
&&
3231 (ifa
== NULL
|| (ifa
->ifa_ifp
!= rt_ifp
&& rt_ifp
!= lo_ifp
) ||
3232 !(ro
->ro_rt
->rt_flags
& RTF_UP
))) {
3233 if (ip_select_srcif_debug
) {
3235 printf("%s->%s ifscope %d ro_if %s != "
3236 "ifa_if %s (cached route cleared)\n",
3237 s_src
, s_dst
, ifscope
, if_name(rt_ifp
),
3238 if_name(ifa
->ifa_ifp
));
3240 printf("%s->%s ifscope %d ro_if %s "
3241 "(no ifa_if found)\n",
3242 s_src
, s_dst
, ifscope
, if_name(rt_ifp
));
3246 RT_UNLOCK(ro
->ro_rt
);
3250 * If the destination is IPv4 LLA and the route's interface
3251 * doesn't match the source interface, then the source IP
3252 * address is wrong; it most likely belongs to the primary
3253 * interface associated with the IPv4 LL subnet. Drop the
3254 * packet rather than letting it go out and return an error
3255 * to the ULP. This actually applies not only to IPv4 LL
3256 * but other shared subnets; for now we explicitly test only
3257 * for the former case and save the latter for future.
3259 if (IN_LINKLOCAL(ntohl(dst
.s_addr
)) &&
3260 !IN_LINKLOCAL(ntohl(src
.s_addr
)) && ifa
!= NULL
) {
3266 if (ip_select_srcif_debug
&& ifa
== NULL
) {
3267 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3268 s_src
, s_dst
, ifscope
);
3272 * If there is a route, mark it accordingly. If there isn't one,
3273 * we'll get here again during the next transmit (possibly with a
3274 * route) and the flag will get set at that point. For IPv4 LLA
3275 * destination, mark it only if the route has been fully resolved;
3276 * otherwise we want to come back here again when the route points
3277 * to the interface over which the ARP reply arrives on.
/* "Fully resolved" here means an AF_LINK gateway with a non-zero sdl_alen. */
3279 if (ro
->ro_rt
!= NULL
&& (!IN_LINKLOCAL(ntohl(dst
.s_addr
)) ||
3280 (ro
->ro_rt
->rt_gateway
->sa_family
== AF_LINK
&&
3281 SDL(ro
->ro_rt
->rt_gateway
)->sdl_alen
!= 0))) {
3283 IFA_ADDREF(ifa
); /* for route */
3285 if (ro
->ro_srcia
!= NULL
) {
3286 IFA_REMREF(ro
->ro_srcia
);
3289 ro
->ro_flags
|= ROF_SRCIF_SELECTED
;
3290 RT_GENID_SYNC(ro
->ro_rt
);
3293 if (ro
->ro_rt
!= NULL
) {
3294 RT_UNLOCK(ro
->ro_rt
);
3301 * @brief Given outgoing interface it determines what checksum needs
3302 * to be computed in software and what needs to be offloaded to the
3305 * @param ifp Pointer to the outgoing interface
3306 * @param m Pointer to the packet
3307 * @param hlen IP header length
3308 * @param ip_len Total packet size i.e. headers + data payload
3309 * @param sw_csum Pointer to a software checksum flag set
/*
 * ip_output_checksum: decide which checksums of an outgoing packet are
 * computed in software (*sw_csum) and which are left to the interface,
 * then adjust m->m_pkthdr.csum_flags accordingly.
 *
 * Logic visible in this listing:
 *   - CSUM_IP is always requested on the packet;
 *   - *sw_csum starts either as all delayed checksum flags of the
 *     packet (offload disabled) or as the packet flags that the
 *     interface capabilities (if_hwassist) do not cover;
 *   - when hlen is not sizeof(struct ip), both delayed checksums are
 *     forced into software;
 *   - otherwise, when the payload checksum is not already in software
 *     and the interface advertises CSUM_PARTIAL, partial offload may be
 *     set up: it requires hwcksum_tx, no TSO, a TCP packet (or
 *     zero-invert support on both the interface and the packet) and
 *     ip_len within the interface MTU (adjusted for CLAT46).  The
 *     start/stuff offsets are filled in and only the IP header
 *     checksum stays in software;
 *   - a payload checksum still owed in software is computed right away
 *     with in_delayed_cksum() and removed from *sw_csum;
 *   - finally csum_flags keeps only the hardware-supported checksum
 *     bits, CSUM_DATA_VALID and the non-checksum bits, or is cleared
 *     completely when offload is disabled.
 *
 * NOTE(review): the last parameter line (sw_csum), the conditions that
 * choose between the "offload disabled" and "offload enabled" branches
 * and several else lines are omitted from this listing.
 */
3314 ip_output_checksum(struct ifnet
*ifp
, struct mbuf
*m
, int hlen
, int ip_len
,
3317 int tso
= TSO_IPV4_OK(ifp
, m
);
3318 uint32_t hwcap
= ifp
->if_hwassist
;
3320 m
->m_pkthdr
.csum_flags
|= CSUM_IP
;
3323 /* do all in software; hardware checksum offload is disabled */
3324 *sw_csum
= (CSUM_DELAY_DATA
| CSUM_DELAY_IP
) &
3325 m
->m_pkthdr
.csum_flags
;
3327 /* do in software what the hardware cannot */
3328 *sw_csum
= m
->m_pkthdr
.csum_flags
&
3329 ~IF_HWASSIST_CSUM_FLAGS(hwcap
);
/* Header length other than the plain struct ip: keep both checksums in software. */
3332 if (hlen
!= sizeof(struct ip
)) {
3333 *sw_csum
|= ((CSUM_DELAY_DATA
| CSUM_DELAY_IP
) &
3334 m
->m_pkthdr
.csum_flags
);
3335 } else if (!(*sw_csum
& CSUM_DELAY_DATA
) && (hwcap
& CSUM_PARTIAL
)) {
3336 int interface_mtu
= ifp
->if_mtu
;
3338 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp
)) {
3339 interface_mtu
= IN6_LINKMTU(ifp
);
3340 /* Further adjust the size for CLAT46 expansion */
3341 interface_mtu
-= CLAT46_HDR_EXPANSION_OVERHD
;
3345 * Partial checksum offload, if non-IP fragment, and TCP only
3346 * (no UDP support, as the hardware may not be able to convert
3347 * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
3348 * supports "invert zero" capability.)
3350 if (hwcksum_tx
&& !tso
&&
3351 ((m
->m_pkthdr
.csum_flags
& CSUM_TCP
) ||
3352 ((hwcap
& CSUM_ZERO_INVERT
) &&
3353 (m
->m_pkthdr
.csum_flags
& CSUM_ZERO_INVERT
))) &&
3354 ip_len
<= interface_mtu
) {
/* Checksumming starts right after the IP header; the result is stored at start + the offset kept in csum_data. */
3355 uint16_t start
= sizeof(struct ip
);
3356 uint16_t ulpoff
= m
->m_pkthdr
.csum_data
& 0xffff;
3357 m
->m_pkthdr
.csum_flags
|=
3358 (CSUM_DATA_VALID
| CSUM_PARTIAL
);
3359 m
->m_pkthdr
.csum_tx_stuff
= (ulpoff
+ start
);
3360 m
->m_pkthdr
.csum_tx_start
= start
;
3361 /* do IP hdr chksum in software */
3362 *sw_csum
= CSUM_DELAY_IP
;
3364 *sw_csum
|= (CSUM_DELAY_DATA
& m
->m_pkthdr
.csum_flags
);
/* Settle any payload checksum still owed in software before returning. */
3368 if (*sw_csum
& CSUM_DELAY_DATA
) {
3369 in_delayed_cksum(m
);
3370 *sw_csum
&= ~CSUM_DELAY_DATA
;
3375 * Drop off bits that aren't supported by hardware;
3376 * also make sure to preserve non-checksum related bits.
3378 m
->m_pkthdr
.csum_flags
=
3379 ((m
->m_pkthdr
.csum_flags
&
3380 (IF_HWASSIST_CSUM_FLAGS(hwcap
) | CSUM_DATA_VALID
)) |
3381 (m
->m_pkthdr
.csum_flags
& ~IF_HWASSIST_CSUM_MASK
));
3383 /* drop all bits; hardware checksum offload is disabled */
3384 m
->m_pkthdr
.csum_flags
= 0;
3389 * GRE protocol output for PPP/PPTP
/*
 * ip_gre_output: send mbuf `m` through ip_output() using a temporary,
 * zero-initialised route `ro`.  All optional ip_output() arguments are
 * passed as NULL/0 and its result is stored in `error`.
 *
 * NOTE(review): the local declarations, any cleanup of `ro` and the
 * return statement are on lines this listing omits.
 */
3392 ip_gre_output(struct mbuf
*m
)
3397 bzero(&ro
, sizeof(ro
));
3399 error
= ip_output(m
, NULL
, &ro
, 0, NULL
, NULL
);
/*
 * sysctl_reset_ip_output_stats: sysctl handler for the
 * ip_output_measure switch.
 *
 * The current value is exposed through sysctl_handle_int().  Processing
 * stops there on error or when no new value was supplied
 * (req->newptr == USER_ADDR_NULL).  A new value must be 0 or 1.  When
 * the value changes and the new value is 1, net_perf is
 * re-initialised with the current ip_output_measure_bins before the
 * new value is stored.
 *
 * NOTE(review): the local declarations, the error code used for an
 * out-of-range value and the exit path are on lines this listing omits.
 */
3407 sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
3409 #pragma unused(arg1, arg2)
3412 i
= ip_output_measure
;
3413 error
= sysctl_handle_int(oidp
, &i
, 0, req
);
3414 if (error
|| req
->newptr
== USER_ADDR_NULL
) {
/* Only the values 0 and 1 are accepted. */
3418 if (i
< 0 || i
> 1) {
/* Start from fresh counters each time measurement is switched on. */
3422 if (ip_output_measure
!= i
&& i
== 1) {
3423 net_perf_initialize(&net_perf
, ip_output_measure_bins
);
3425 ip_output_measure
= i
;
/*
 * sysctl_ip_output_measure_bins: sysctl handler for
 * ip_output_measure_bins.
 *
 * The current value is exposed through sysctl_handle_quad().
 * Processing stops there on error or when no new value was supplied
 * (req->newptr == USER_ADDR_NULL).  A new value is stored only after
 * net_perf_validate_bins() accepts it.
 *
 * NOTE(review): the local declarations, the error code used for a
 * rejected value and the exit path are on lines this listing omits.
 */
3431 sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
3433 #pragma unused(arg1, arg2)
3437 i
= ip_output_measure_bins
;
3438 error
= sysctl_handle_quad(oidp
, &i
, 0, req
);
3439 if (error
|| req
->newptr
== USER_ADDR_NULL
) {
3443 if (!net_perf_validate_bins(i
)) {
3447 ip_output_measure_bins
= i
;
3453 sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
3455 #pragma unused(oidp, arg1, arg2)
3456 if (req
->oldptr
== USER_ADDR_NULL
) {
3457 req
->oldlen
= (size_t)sizeof(struct ipstat
);
3460 return SYSCTL_OUT(req
, &net_perf
, MIN(sizeof(net_perf
), req
->oldlen
));