[apple/xnu.git] / bsd / netinet / ip_output.c
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
61 * $FreeBSD: src/sys/netinet/ip_output.c,v 1.99.2.16 2001/07/19 06:37:26 kris Exp $
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #define _IP_VHL
71
72 #include <sys/param.h>
73 #include <sys/systm.h>
74 #include <sys/kernel.h>
75 #include <sys/malloc.h>
76 #include <sys/mbuf.h>
77 #include <sys/protosw.h>
78 #include <sys/socket.h>
79 #include <sys/socketvar.h>
80 #include <kern/locks.h>
81 #include <sys/sysctl.h>
82 #include <sys/mcache.h>
83
84 #include <machine/endian.h>
85 #include <pexpert/pexpert.h>
86
87 #include <net/if.h>
88 #include <net/if_dl.h>
89 #include <net/if_types.h>
90 #include <net/route.h>
91 #include <net/ntstat.h>
92 #include <net/net_osdep.h>
93
94 #include <netinet/in.h>
95 #include <netinet/in_systm.h>
96 #include <netinet/ip.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_var.h>
99 #include <netinet/ip_var.h>
100
101 #include <netinet/kpi_ipfilter_var.h>
102
103 #if CONFIG_MACF_NET
104 #include <security/mac_framework.h>
105 #endif
106
107 #include <net/dlil.h>
108 #include <sys/kdebug.h>
109 #include <libkern/OSAtomic.h>
110
111 #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
112 #define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
113 #define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
114 #define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
115
116 #define SWAP16(v) ((((v) & 0xff) << 8) | ((v) >> 8))
117
118 #if IPSEC
119 #include <netinet6/ipsec.h>
120 #include <netkey/key.h>
121 #if IPSEC_DEBUG
122 #include <netkey/key_debug.h>
123 #else
124 #define KEYDEBUG(lev,arg)
125 #endif
126 #endif /*IPSEC*/
127
128 #include <netinet/ip_fw.h>
129 #include <netinet/ip_divert.h>
130 #include <mach/sdt.h>
131
132 #if DUMMYNET
133 #include <netinet/ip_dummynet.h>
134 #endif
135
136 #if PF
137 #include <net/pfvar.h>
138 #endif /* PF */
139
140 #if IPFIREWALL_FORWARD_DEBUG
141 #define print_ip(a) printf("%ld.%ld.%ld.%ld",(ntohl(a.s_addr)>>24)&0xFF,\
142 (ntohl(a.s_addr)>>16)&0xFF,\
143 (ntohl(a.s_addr)>>8)&0xFF,\
144 (ntohl(a.s_addr))&0xFF);
145 #endif
146
147 u_short ip_id;
148
149 static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
150 static void ip_mloopback(struct ifnet *, struct mbuf *,
151 struct sockaddr_in *, int);
152 static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
153 static void imo_trace(struct ip_moptions *, int);
154
155 static void ip_out_cksum_stats(int, u_int32_t);
156 static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
157
158 int ip_optcopy(struct ip *, struct ip *);
159 void in_delayed_cksum_offset(struct mbuf *, int );
160 void in_cksum_offset(struct mbuf* , size_t );
161
162 extern struct protosw inetsw[];
163
164 extern struct ip_linklocal_stat ip_linklocal_stat;
165 extern lck_mtx_t *ip_mutex;
166
167 /* temporary: for testing */
168 #if IPSEC
169 extern int ipsec_bypass;
170 #endif
171
172 static int ip_maxchainsent = 0;
173 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent, CTLFLAG_RW | CTLFLAG_LOCKED,
174 &ip_maxchainsent, 0, "use dlil_output_list");
175 #if DEBUG
176 static int forge_ce = 0;
177 SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW | CTLFLAG_LOCKED,
178 &forge_ce, 0, "Forge ECN CE");
179 #endif /* DEBUG */
180
181 static int ip_select_srcif_debug = 0;
182 SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
183 &ip_select_srcif_debug, 0, "log source interface selection debug info");
184
185 #define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
186
187 /* For gdb */
188 __private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
189
190 struct ip_moptions_dbg {
191 struct ip_moptions imo; /* ip_moptions */
192 u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */
193 u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */
194 /*
195 * Alloc and free callers.
196 */
197 ctrace_t imo_alloc;
198 ctrace_t imo_free;
199 /*
200 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
201 */
202 ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE];
203 ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE];
204 };
205
206 #if DEBUG
207 static unsigned int imo_debug = 1; /* debugging (enabled) */
208 #else
209 static unsigned int imo_debug; /* debugging (disabled) */
210 #endif /* !DEBUG */
211 static unsigned int imo_size; /* size of zone element */
212 static struct zone *imo_zone; /* zone for ip_moptions */
213
214 #define IMO_ZONE_MAX 64 /* maximum elements in zone */
215 #define IMO_ZONE_NAME "ip_moptions" /* zone name */
216
217 /*
218 * IP output. The packet in mbuf chain m contains a skeletal IP
219 * header (with len, off, ttl, proto, tos, src, dst).
220 * The mbuf chain containing the packet will be freed.
221 * The mbuf opt, if present, will not be freed.
222 */
223 int
224 ip_output(
225 struct mbuf *m0,
226 struct mbuf *opt,
227 struct route *ro,
228 int flags,
229 struct ip_moptions *imo,
230 struct ip_out_args *ipoa)
231 {
232 int error;
233 error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);
234 return error;
235 }
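/*
 * Editor's note -- illustrative sketch, not part of the original
 * source: a minimal caller of the ip_output() wrapper above.  The
 * mbuf "m" is assumed to already carry the skeletal IP header
 * (len, off, ttl, proto, tos, src, dst) described in the comment
 * above, and the caller-owned route cache is released afterwards,
 * as callers of this interface normally do.
 *
 *	struct route ro;
 *	int error;
 *
 *	bzero(&ro, sizeof (ro));
 *	error = ip_output(m, NULL, &ro, IP_ALLOWBROADCAST, NULL, NULL);
 *	if (ro.ro_rt != NULL) {
 *		rtfree(ro.ro_rt);
 *		ro.ro_rt = NULL;
 *	}
 */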
236
237 /*
238 * Returns: 0 Success
239 * ENOMEM
240 * EADDRNOTAVAIL
241 * ENETUNREACH
242 * EHOSTUNREACH
243 * EACCES
244 * EMSGSIZE
245 * ENOBUFS
246 * ipsec4_getpolicybyaddr:??? [IPSEC 4th argument, contents modified]
247 * ipsec4_getpolicybysock:??? [IPSEC 4th argument, contents modified]
248 * key_spdacquire:??? [IPSEC]
249 * ipsec4_output:??? [IPSEC]
250 * ip_dn_io_ptr:??? [dummynet]
251 * dlil_output:??? [DLIL]
252 * dlil_output_list:??? [DLIL]
253 *
254 * Notes: The ipsec4_getpolicyby{addr|sock} function error returns are
255 * only used as the error return from this function where one of
256 * these functions fails to return a policy.
257 */
258 int
259 ip_output_list(
260 struct mbuf *m0,
261 int packetchain,
262 struct mbuf *opt,
263 struct route *ro,
264 int flags,
265 struct ip_moptions *imo,
266 struct ip_out_args *ipoa)
267 {
268 struct ip *ip;
269 struct ifnet *ifp = NULL;
270 struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
271 int hlen = sizeof (struct ip);
272 int len = 0, error = 0;
273 struct sockaddr_in *dst = NULL;
274 struct in_ifaddr *ia = NULL, *src_ia = NULL;
275 int isbroadcast, sw_csum;
276 struct in_addr pkt_dst;
277 struct ipf_pktopts *ippo = NULL, ipf_pktopts;
278 #if IPSEC
279 struct ipsec_output_state ipsec_state;
280 struct route *ipsec_saved_route = NULL;
281 struct socket *so = NULL;
282 struct secpolicy *sp = NULL;
283 #endif
284 #if IPFIREWALL_FORWARD
285 int fwd_rewrite_src = 0;
286 #endif
287 #if IPFIREWALL
288 int off;
289 struct sockaddr_in *next_hop_from_ipfwd_tag = NULL;
290 #endif
291 #if IPFIREWALL || DUMMYNET
292 struct ip_fw_args args;
293 struct m_tag *tag;
294 #endif
295 int didfilter = 0;
296 ipfilter_t inject_filter_ref = 0;
297 #if DUMMYNET
298 struct route saved_route;
299 struct ip_out_args saved_ipoa;
300 struct sockaddr_in dst_buf;
301 #endif /* DUMMYNET */
302 struct mbuf * packetlist;
303 int pktcnt = 0, tso = 0;
304 u_int32_t bytecnt = 0;
305 unsigned int ifscope = IFSCOPE_NONE;
306 unsigned int nocell = 0;
307 boolean_t select_srcif, srcbound;
308 struct flowadv *adv = NULL;
309
310 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
311
312 #if IPSEC
313 bzero(&ipsec_state, sizeof(ipsec_state));
314 #endif /* IPSEC */
315
316 packetlist = m0;
317 #if IPFIREWALL || DUMMYNET
318 bzero(&args, sizeof(struct ip_fw_args));
319
320 if (SLIST_EMPTY(&m0->m_pkthdr.tags))
321 goto ipfw_tags_done;
322
323 /* Grab info from mtags prepended to the chain */
324 #if DUMMYNET
325 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
326 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
327 struct dn_pkt_tag *dn_tag;
328
329 dn_tag = (struct dn_pkt_tag *)(tag+1);
330 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule;
331 args.fwa_pf_rule = dn_tag->dn_pf_rule;
332 opt = NULL;
333 saved_route = dn_tag->dn_ro;
334 ro = &saved_route;
335
336 imo = NULL;
337 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
338 dst = &dst_buf;
339 ifp = dn_tag->dn_ifp;
340 flags = dn_tag->dn_flags;
341 if ((dn_tag->dn_flags & IP_OUTARGS)) {
342 saved_ipoa = dn_tag->dn_ipoa;
343 ipoa = &saved_ipoa;
344 }
345
346 m_tag_delete(m0, tag);
347 }
348 #endif /* DUMMYNET */
349
350 #if IPDIVERT
351 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
352 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
353 struct divert_tag *div_tag;
354
355 div_tag = (struct divert_tag *)(tag+1);
356 args.fwa_divert_rule = div_tag->cookie;
357
358 m_tag_delete(m0, tag);
359 }
360 #endif /* IPDIVERT */
361
362 #if IPFIREWALL
363 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
364 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
365 struct ip_fwd_tag *ipfwd_tag;
366
367 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
368 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop;
369
370 m_tag_delete(m0, tag);
371 }
372 #endif /* IPFIREWALL */
373
374 ipfw_tags_done:
375 #endif /* IPFIREWALL || DUMMYNET */
376
377 m = m0;
378
379 #if DIAGNOSTIC
 380         if ( !m || (m->m_flags & M_PKTHDR) == 0)
381 panic("ip_output no HDR");
382 if (!ro)
383 panic("ip_output no route, proto = %d",
384 mtod(m, struct ip *)->ip_p);
385 #endif
386
387 bzero(&ipf_pktopts, sizeof(struct ipf_pktopts));
388 ippo = &ipf_pktopts;
389
390 if (ip_doscopedroute && (flags & IP_OUTARGS)) {
391 /*
392 * In the forwarding case, only the ifscope value is used,
393 * as source interface selection doesn't take place.
394 */
395 if ((select_srcif = (!(flags & IP_FORWARDING) &&
396 (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
397 ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
398 }
399
400 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
401 ipoa->ipoa_boundif != IFSCOPE_NONE) {
402 ifscope = ipoa->ipoa_boundif;
403 ipf_pktopts.ippo_flags |=
404 (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
405 }
406
407 if ((srcbound = (ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR)))
408 ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
409 } else {
410 select_srcif = FALSE;
411 srcbound = FALSE;
412 ifscope = IFSCOPE_NONE;
413 }
414
415 if ((flags & IP_OUTARGS) && (ipoa->ipoa_flags & IPOAF_NO_CELLULAR)) {
416 nocell = 1;
417 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
418 }
419
420 if (flags & IP_OUTARGS) {
421 adv = &ipoa->ipoa_flowadv;
422 adv->code = FADV_SUCCESS;
423 }
424
425 #if DUMMYNET
426 if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) {
427 /* dummynet already saw us */
428 ip = mtod(m, struct ip *);
429 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
430 pkt_dst = ip->ip_dst;
431 if (ro->ro_rt != NULL) {
432 RT_LOCK_SPIN(ro->ro_rt);
433 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
434 if (ia) {
435 /* Become a regular mutex */
436 RT_CONVERT_LOCK(ro->ro_rt);
437 IFA_ADDREF(&ia->ia_ifa);
438 }
439 RT_UNLOCK(ro->ro_rt);
440 }
441 #if IPSEC
442 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
443 so = ipsec_getsocket(m);
444 (void)ipsec_setsocket(m, NULL);
445 }
446 #endif /* IPSEC */
447 #if IPFIREWALL
448 if (args.fwa_ipfw_rule != NULL)
449 goto skip_ipsec;
450 #endif /* #if IPFIREWALL */
451 if (args.fwa_pf_rule != NULL)
452 goto sendit;
453 }
454 #endif /* DUMMYNET */
455
456 #if IPSEC
457 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
458 so = ipsec_getsocket(m);
459 (void)ipsec_setsocket(m, NULL);
460 }
461 #endif
462 loopit:
463 /*
 464          * No need to process the packet twice if we've
465 * already seen it
466 */
467 if (!SLIST_EMPTY(&m->m_pkthdr.tags))
468 inject_filter_ref = ipf_get_inject_filter(m);
469 else
470 inject_filter_ref = 0;
471
472 if (opt) {
473 m = ip_insertoptions(m, opt, &len);
474 hlen = len;
475 /* Update the chain */
476 if (m != m0) {
477 if (m0 == packetlist)
478 packetlist = m;
479 m0 = m;
480 }
481 }
482 ip = mtod(m, struct ip *);
483 #if IPFIREWALL
484 /*
485 * rdar://8542331
486 *
487 * When dealing with a packet chain, we need to reset "next_hop" because
488 * "dst" may have been changed to the gateway address below for the previous
 489          * packet of the chain. This could cause the route to be inadvertently changed
490 * to the route to the gateway address (instead of the route to the destination).
491 */
492 args.fwa_next_hop = next_hop_from_ipfwd_tag;
493 pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst;
494 #else
495 pkt_dst = ip->ip_dst;
496 #endif
497
498 /*
499 * We must not send if the packet is destined to network zero.
500 * RFC1122 3.2.1.3 (a) and (b).
501 */
502 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
503 error = EHOSTUNREACH;
504 goto bad;
505 }
506
507 /*
508 * Fill in IP header.
509 */
510 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
511 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
512 ip->ip_off &= IP_DF;
513 #if RANDOM_IP_ID
514 ip->ip_id = ip_randomid();
515 #else
516 ip->ip_id = htons(ip_id++);
517 #endif
518 OSAddAtomic(1, &ipstat.ips_localout);
519 } else {
520 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
521 }
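        /*
         * Editor's note -- worked example, not part of the original source:
         * for the common 20-byte header, hlen >> 2 == 5, so
         * IP_MAKE_VHL(IPVERSION, 5) packs version 4 into the high nibble and
         * the header length in 32-bit words into the low nibble, giving
         * ip_vhl == 0x45.  The forwarding/raw-output branch reverses this
         * with IP_VHL_HL(0x45) << 2 == 20.
         */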
522
523 #if DEBUG
524 /* For debugging, we let the stack forge congestion */
525 if (forge_ce != 0 &&
526 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
527 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
528 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
529 forge_ce--;
530 }
531 #endif /* DEBUG */
532
533 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr,
534 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
535
536 dst = (struct sockaddr_in *)(void *)&ro->ro_dst;
537
538 /*
539 * If there is a cached route,
540 * check that it is to the same destination
541 * and is still up. If not, free it and try again.
542 * The address family should also be checked in case of sharing the
543 * cache with IPv6.
544 */
545
546 if (ro->ro_rt != NULL) {
547 if (ro->ro_rt->generation_id != route_generation &&
548 ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0) &&
549 (ip->ip_src.s_addr != INADDR_ANY)) {
550 src_ia = ifa_foraddr(ip->ip_src.s_addr);
551 if (src_ia == NULL) {
552 error = EADDRNOTAVAIL;
553 goto bad;
554 }
555 IFA_REMREF(&src_ia->ia_ifa);
556 }
557 /*
558 * Test rt_flags without holding rt_lock for performance
559 * reasons; if the route is down it will hopefully be
560 * caught by the layer below (since it uses this route
561 * as a hint) or during the next transmit.
562 */
563 if ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
564 dst->sin_family != AF_INET ||
565 dst->sin_addr.s_addr != pkt_dst.s_addr) {
566 rtfree(ro->ro_rt);
567 ro->ro_rt = NULL;
568 }
569 /*
570 * If we're doing source interface selection, we may not
571 * want to use this route; only synch up the generation
572 * count otherwise.
573 */
574 if (!select_srcif && ro->ro_rt != NULL &&
575 ro->ro_rt->generation_id != route_generation)
576 ro->ro_rt->generation_id = route_generation;
577 }
578 if (ro->ro_rt == NULL) {
579 bzero(dst, sizeof(*dst));
580 dst->sin_family = AF_INET;
581 dst->sin_len = sizeof(*dst);
582 dst->sin_addr = pkt_dst;
583 }
584 /*
585 * If routing to interface only,
586 * short circuit routing lookup.
587 */
588 if (flags & IP_ROUTETOIF) {
589 if (ia)
590 IFA_REMREF(&ia->ia_ifa);
591 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == 0) {
592 if ((ia = ifatoia(ifa_ifwithnet(sintosa(dst)))) == 0) {
593 OSAddAtomic(1, &ipstat.ips_noroute);
594 error = ENETUNREACH;
595 goto bad;
596 }
597 }
598 ifp = ia->ia_ifp;
599 ip->ip_ttl = 1;
600 isbroadcast = in_broadcast(dst->sin_addr, ifp);
601 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
602 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
603 /*
604 * Bypass the normal routing lookup for multicast
605 * packets if the interface is specified.
606 */
607 isbroadcast = 0;
608 if (ia != NULL)
609 IFA_REMREF(&ia->ia_ifa);
610
611 /* Macro takes reference on ia */
612 IFP_TO_IA(ifp, ia);
613 } else {
614 boolean_t cloneok = FALSE;
615 /*
616 * Perform source interface selection; the source IP address
617 * must belong to one of the addresses of the interface used
618 * by the route. For performance reasons, do this only if
619 * there is no route, or if the routing table has changed,
620 * or if we haven't done source interface selection on this
621 * route (for this PCB instance) before.
622 */
623 if (select_srcif && ip->ip_src.s_addr != INADDR_ANY &&
624 (ro->ro_rt == NULL || !(ro->ro_rt->rt_flags & RTF_UP) ||
625 ro->ro_rt->generation_id != route_generation ||
626 !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
627 struct ifaddr *ifa;
628
629 /* Find the source interface */
630 ifa = in_selectsrcif(ip, ro, ifscope);
631
632 /*
633 * If the source address belongs to a cellular interface
634 * and the caller forbids our using interfaces of such
635 * type, pretend that there is no source address.
636 */
637 if (nocell && ifa != NULL &&
638 ifa->ifa_ifp->if_type == IFT_CELLULAR) {
639 IFA_REMREF(ifa);
640 error = EADDRNOTAVAIL;
641 goto bad;
642 }
643
644 /*
645 * If the source address is spoofed (in the case of
646 * IP_RAWOUTPUT on an unbounded socket), or if this
647 * is destined for local/loopback, just let it go out
648 * using the interface of the route. Otherwise,
649 * there's no interface having such an address,
650 * so bail out.
651 */
652 if (ifa == NULL && (!(flags & IP_RAWOUTPUT) ||
653 srcbound) && ifscope != lo_ifp->if_index) {
654 error = EADDRNOTAVAIL;
655 goto bad;
656 }
657
658 /*
659 * If the caller didn't explicitly specify the scope,
660 * pick it up from the source interface. If the cached
661 * route was wrong and was blown away as part of source
662 * interface selection, don't mask out RTF_PRCLONING
663 * since that route may have been allocated by the ULP,
664 * unless the IP header was created by the caller or
665 * the destination is IPv4 LLA. The check for the
666 * latter is needed because IPv4 LLAs are never scoped
667 * in the current implementation, and we don't want to
668 * replace the resolved IPv4 LLA route with one whose
669 * gateway points to that of the default gateway on
670 * the primary interface of the system.
671 */
672 if (ifa != NULL) {
673 if (ifscope == IFSCOPE_NONE)
674 ifscope = ifa->ifa_ifp->if_index;
675 IFA_REMREF(ifa);
676 cloneok = (!(flags & IP_RAWOUTPUT) &&
677 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
678 }
679 }
680
681 /*
682 * If this is the case, we probably don't want to allocate
683 * a protocol-cloned route since we didn't get one from the
684 * ULP. This lets TCP do its thing, while not burdening
685 * forwarding or ICMP with the overhead of cloning a route.
686 * Of course, we still want to do any cloning requested by
687 * the link layer, as this is probably required in all cases
688 * for correct operation (as it is for ARP).
689 */
690 if (ro->ro_rt == NULL) {
691 unsigned long ign = RTF_PRCLONING;
692 /*
693 * We make an exception here: if the destination
694 * address is INADDR_BROADCAST, allocate a protocol-
695 * cloned host route so that we end up with a route
696 * marked with the RTF_BROADCAST flag. Otherwise,
697 * we would end up referring to the default route,
698 * instead of creating a cloned host route entry.
699 * That would introduce inconsistencies between ULPs
700 * that allocate a route and those that don't. The
701 * RTF_BROADCAST route is important since we'd want
702 * to send out undirected IP broadcast packets using
703 * link-level broadcast address. Another exception
704 * is for ULP-created routes that got blown away by
705 * source interface selection (see above).
706 *
707 * These exceptions will no longer be necessary when
708 * the RTF_PRCLONING scheme is no longer present.
709 */
710 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST)
711 ign &= ~RTF_PRCLONING;
712
713 /*
714 * Loosen the route lookup criteria if the ifscope
715 * corresponds to the loopback interface; this is
716 * needed to support Application Layer Gateways
717 * listening on loopback, in conjunction with packet
718 * filter redirection rules. The final source IP
719 * address will be rewritten by the packet filter
720 * prior to the RFC1122 loopback check below.
721 */
722 if (ifscope == lo_ifp->if_index)
723 rtalloc_ign(ro, ign);
724 else
725 rtalloc_scoped_ign(ro, ign, ifscope);
726
727 /*
728 * If the route points to a cellular interface and the
729 * caller forbids our using interfaces of such type,
730 * pretend that there is no route.
731 */
732 if (nocell && ro->ro_rt != NULL) {
733 RT_LOCK_SPIN(ro->ro_rt);
734 if (ro->ro_rt->rt_ifp->if_type ==
735 IFT_CELLULAR) {
736 RT_UNLOCK(ro->ro_rt);
737 rtfree(ro->ro_rt);
738 ro->ro_rt = NULL;
739 } else {
740 RT_UNLOCK(ro->ro_rt);
741 }
742 }
743 }
744
745 if (ro->ro_rt == NULL) {
746 OSAddAtomic(1, &ipstat.ips_noroute);
747 error = EHOSTUNREACH;
748 goto bad;
749 }
750
751 if (ia)
752 IFA_REMREF(&ia->ia_ifa);
753 RT_LOCK_SPIN(ro->ro_rt);
754 ia = ifatoia(ro->ro_rt->rt_ifa);
755 if (ia) {
756 /* Become a regular mutex */
757 RT_CONVERT_LOCK(ro->ro_rt);
758 IFA_ADDREF(&ia->ia_ifa);
759 }
760 ifp = ro->ro_rt->rt_ifp;
761 ro->ro_rt->rt_use++;
762 if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
763 dst = (struct sockaddr_in *)(void *)
764 ro->ro_rt->rt_gateway;
765 }
766 if (ro->ro_rt->rt_flags & RTF_HOST) {
767 isbroadcast = (ro->ro_rt->rt_flags & RTF_BROADCAST);
768 } else {
769 /* Become a regular mutex */
770 RT_CONVERT_LOCK(ro->ro_rt);
771 isbroadcast = in_broadcast(dst->sin_addr, ifp);
772 }
773 RT_UNLOCK(ro->ro_rt);
774 }
775
776 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
777 struct in_multi *inm;
778 u_int32_t vif;
779 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
780 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
781
782 m->m_flags |= M_MCAST;
783 /*
784 * IP destination address is multicast. Make sure "dst"
785 * still points to the address in "ro". (It may have been
786 * changed to point to a gateway address, above.)
787 */
788 dst = (struct sockaddr_in *)(void *)&ro->ro_dst;
789 /*
790 * See if the caller provided any multicast options
791 */
792 if (imo != NULL) {
793 IMO_LOCK(imo);
794 vif = imo->imo_multicast_vif;
795 ttl = imo->imo_multicast_ttl;
796 loop = imo->imo_multicast_loop;
797 if ((flags & IP_RAWOUTPUT) == 0)
798 ip->ip_ttl = ttl;
799 if (imo->imo_multicast_ifp != NULL)
800 ifp = imo->imo_multicast_ifp;
801 IMO_UNLOCK(imo);
802 #if MROUTING
803 if (vif != -1 && ((flags & IP_RAWOUTPUT) == 0 ||
804 ip->ip_src.s_addr == INADDR_ANY))
805 ip->ip_src.s_addr = ip_mcast_src(vif);
806 #endif /* MROUTING */
807 } else if ((flags & IP_RAWOUTPUT) == 0) {
808 vif = -1;
809 ip->ip_ttl = ttl;
810 }
811 /*
812 * Confirm that the outgoing interface supports multicast.
813 */
814 if (imo == NULL || vif == -1) {
815 if ((ifp->if_flags & IFF_MULTICAST) == 0) {
816 OSAddAtomic(1, &ipstat.ips_noroute);
817 error = ENETUNREACH;
818 goto bad;
819 }
820 }
821 /*
822 * If source address not specified yet, use address
823 * of outgoing interface.
824 */
825 if (ip->ip_src.s_addr == INADDR_ANY) {
826 struct in_ifaddr *ia1;
827 lck_rw_lock_shared(in_ifaddr_rwlock);
828 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
829 IFA_LOCK_SPIN(&ia1->ia_ifa);
830 if (ia1->ia_ifp == ifp) {
831 ip->ip_src = IA_SIN(ia1)->sin_addr;
832 IFA_UNLOCK(&ia1->ia_ifa);
833 break;
834 }
835 IFA_UNLOCK(&ia1->ia_ifa);
836 }
837 lck_rw_done(in_ifaddr_rwlock);
838 if (ip->ip_src.s_addr == INADDR_ANY) {
839 error = ENETUNREACH;
840 goto bad;
841 }
842 }
843
844 in_multihead_lock_shared();
845 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
846 in_multihead_lock_done();
847 if (inm != NULL && (imo == NULL || loop)) {
848 /*
849 * If we belong to the destination multicast group
850 * on the outgoing interface, and the caller did not
851 * forbid loopback, loop back a copy.
852 */
853 if (!TAILQ_EMPTY(&ipv4_filters)) {
854 struct ipfilter *filter;
855 int seen = (inject_filter_ref == 0);
856
857 if (imo != NULL) {
858 ipf_pktopts.ippo_flags |= IPPOF_MCAST_OPTS;
859 ipf_pktopts.ippo_mcast_ifnet = ifp;
860 ipf_pktopts.ippo_mcast_ttl = ttl;
861 ipf_pktopts.ippo_mcast_loop = loop;
862 }
863
864 ipf_ref();
865
866 /* 4135317 - always pass network byte order to filter */
867
868 #if BYTE_ORDER != BIG_ENDIAN
869 HTONS(ip->ip_len);
870 HTONS(ip->ip_off);
871 #endif
872
873 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
874 if (seen == 0) {
875 if ((struct ipfilter *)inject_filter_ref == filter)
876 seen = 1;
877 } else if (filter->ipf_filter.ipf_output) {
878 errno_t result;
879 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
880 if (result == EJUSTRETURN) {
881 ipf_unref();
882 INM_REMREF(inm);
883 goto done;
884 }
885 if (result != 0) {
886 ipf_unref();
887 INM_REMREF(inm);
888 goto bad;
889 }
890 }
891 }
892
893 /* set back to host byte order */
894 ip = mtod(m, struct ip *);
895
896 #if BYTE_ORDER != BIG_ENDIAN
897 NTOHS(ip->ip_len);
898 NTOHS(ip->ip_off);
899 #endif
900
901 ipf_unref();
902 didfilter = 1;
903 }
904 ip_mloopback(ifp, m, dst, hlen);
905 }
906 #if MROUTING
907 else {
908 /*
909 * If we are acting as a multicast router, perform
910 * multicast forwarding as if the packet had just
911 * arrived on the interface to which we are about
912 * to send. The multicast forwarding function
913 * recursively calls this function, using the
914 * IP_FORWARDING flag to prevent infinite recursion.
915 *
916 * Multicasts that are looped back by ip_mloopback(),
917 * above, will be forwarded by the ip_input() routine,
918 * if necessary.
919 */
920 if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
921 /*
922 * Check if rsvp daemon is running. If not, don't
923 * set ip_moptions. This ensures that the packet
924 * is multicast and not just sent down one link
925 * as prescribed by rsvpd.
926 */
927 if (!rsvp_on)
928 imo = NULL;
929 if (ip_mforward(ip, ifp, m, imo) != 0) {
930 m_freem(m);
931 if (inm != NULL)
932 INM_REMREF(inm);
933 OSAddAtomic(1, &ipstat.ips_cantforward);
934 goto done;
935 }
936 }
937 }
938 #endif /* MROUTING */
939 if (inm != NULL)
940 INM_REMREF(inm);
941 /*
942 * Multicasts with a time-to-live of zero may be looped-
943 * back, above, but must not be transmitted on a network.
944 * Also, multicasts addressed to the loopback interface
945 * are not sent -- the above call to ip_mloopback() will
946 * loop back a copy if this host actually belongs to the
947 * destination group on the loopback interface.
948 */
949 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
950 m_freem(m);
951 goto done;
952 }
953
954 goto sendit;
955 }
956 /*
957 * If source address not specified yet, use address
958 * of outgoing interface.
959 */
960 if (ip->ip_src.s_addr == INADDR_ANY) {
961 IFA_LOCK_SPIN(&ia->ia_ifa);
962 ip->ip_src = IA_SIN(ia)->sin_addr;
963 IFA_UNLOCK(&ia->ia_ifa);
964 #if IPFIREWALL_FORWARD
965 /* Keep note that we did this - if the firewall changes
966 * the next-hop, our interface may change, changing the
967 * default source IP. It's a shame so much effort happens
968 * twice. Oh well.
969 */
970 fwd_rewrite_src++;
971 #endif /* IPFIREWALL_FORWARD */
972 }
973
974 /*
 975          * Look for broadcast address
976 * and verify user is allowed to send
977 * such a packet.
978 */
979 if (isbroadcast) {
980 if ((ifp->if_flags & IFF_BROADCAST) == 0) {
981 error = EADDRNOTAVAIL;
982 goto bad;
983 }
984 if ((flags & IP_ALLOWBROADCAST) == 0) {
985 error = EACCES;
986 goto bad;
987 }
988 /* don't allow broadcast messages to be fragmented */
989 if ((u_short)ip->ip_len > ifp->if_mtu) {
990 error = EMSGSIZE;
991 goto bad;
992 }
993 m->m_flags |= M_BCAST;
994 } else {
995 m->m_flags &= ~M_BCAST;
996 }
997
998 sendit:
999 #if PF
1000 /* Invoke outbound packet filter */
1001 if (PF_IS_ENABLED) {
1002 int rc;
1003
1004 m0 = m; /* Save for later */
1005 #if DUMMYNET
1006 args.fwa_m = m;
1007 args.fwa_next_hop = dst;
1008 args.fwa_oif = ifp;
1009 args.fwa_ro = ro;
1010 args.fwa_dst = dst;
1011 args.fwa_oflags = flags;
1012 if (flags & IP_OUTARGS)
1013 args.fwa_ipoa = ipoa;
1014 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args);
1015 #else /* DUMMYNET */
1016 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1017 #endif /* DUMMYNET */
1018 if (rc != 0 || m == NULL) {
1019 /* Move to the next packet */
1020 m = *mppn;
1021
1022 /* Skip ahead if first packet in list got dropped */
1023 if (packetlist == m0)
1024 packetlist = m;
1025
1026 if (m != NULL) {
1027 m0 = m;
1028 /* Next packet in the chain */
1029 goto loopit;
1030 } else if (packetlist != NULL) {
1031                                 /* No more packets; send down the chain */
1032 goto sendchain;
1033 }
1034 /* Nothing left; we're done */
1035 goto done;
1036 }
1037 m0 = m;
1038 ip = mtod(m, struct ip *);
1039 pkt_dst = ip->ip_dst;
1040 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1041 }
1042 #endif /* PF */
1043 /*
1044 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1045 */
1046 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) || IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1047 ip_linklocal_stat.iplls_out_total++;
1048 if (ip->ip_ttl != MAXTTL) {
1049 ip_linklocal_stat.iplls_out_badttl++;
1050 ip->ip_ttl = MAXTTL;
1051 }
1052 }
1053
1054 if (!didfilter && !TAILQ_EMPTY(&ipv4_filters)) {
1055 struct ipfilter *filter;
1056 int seen = (inject_filter_ref == 0);
1057 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1058
1059 /* Check that a TSO frame isn't passed to a filter.
1060 * This could happen if a filter is inserted while
1061 * TCP is sending the TSO packet.
1062 */
1063 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1064 error = EMSGSIZE;
1065 goto bad;
1066 }
1067
1068 ipf_ref();
1069
1070 /* 4135317 - always pass network byte order to filter */
1071
1072 #if BYTE_ORDER != BIG_ENDIAN
1073 HTONS(ip->ip_len);
1074 HTONS(ip->ip_off);
1075 #endif
1076
1077 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1078 if (seen == 0) {
1079 if ((struct ipfilter *)inject_filter_ref == filter)
1080 seen = 1;
1081 } else if (filter->ipf_filter.ipf_output) {
1082 errno_t result;
1083 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
1084 if (result == EJUSTRETURN) {
1085 ipf_unref();
1086 goto done;
1087 }
1088 if (result != 0) {
1089 ipf_unref();
1090 goto bad;
1091 }
1092 }
1093 }
1094
1095 /* set back to host byte order */
1096 ip = mtod(m, struct ip *);
1097
1098 #if BYTE_ORDER != BIG_ENDIAN
1099 NTOHS(ip->ip_len);
1100 NTOHS(ip->ip_off);
1101 #endif
1102
1103 ipf_unref();
1104 }
1105
1106 #if IPSEC
1107         /* temporary for testing only: bypass ipsec altogether */
1108
1109 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC) != 0)
1110 goto skip_ipsec;
1111
1112 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0,0,0,0,0);
1113
1114
1115 /* get SP for this packet */
1116 if (so == NULL)
1117 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND, flags, &error);
1118 else
1119 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND, so, &error);
1120
1121 if (sp == NULL) {
1122 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1123 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 0,0,0,0,0);
1124 goto bad;
1125 }
1126
1127 error = 0;
1128
1129 /* check policy */
1130 switch (sp->policy) {
1131 case IPSEC_POLICY_DISCARD:
1132 case IPSEC_POLICY_GENERATE:
1133 /*
1134 * This packet is just discarded.
1135 */
1136 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1137 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 1,0,0,0,0);
1138 goto bad;
1139
1140 case IPSEC_POLICY_BYPASS:
1141 case IPSEC_POLICY_NONE:
1142 /* no need to do IPsec. */
1143 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 2,0,0,0,0);
1144 goto skip_ipsec;
1145
1146 case IPSEC_POLICY_IPSEC:
1147 if (sp->req == NULL) {
1148 /* acquire a policy */
1149 error = key_spdacquire(sp);
1150 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 3,0,0,0,0);
1151 goto bad;
1152 }
1153 break;
1154
1155 case IPSEC_POLICY_ENTRUST:
1156 default:
1157 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1158 }
1159 {
1160 ipsec_state.m = m;
1161 if (flags & IP_ROUTETOIF) {
1162 bzero(&ipsec_state.ro, sizeof(ipsec_state.ro));
1163 } else
1164 route_copyout(&ipsec_state.ro, ro, sizeof(ipsec_state.ro));
1165 ipsec_state.dst = (struct sockaddr *)dst;
1166
1167 ip->ip_sum = 0;
1168
1169 /*
1170 * XXX
1171 * delayed checksums are not currently compatible with IPsec
1172 */
1173 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1174 in_delayed_cksum(m);
1175 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1176 }
1177
1178
1179 #if BYTE_ORDER != BIG_ENDIAN
1180 HTONS(ip->ip_len);
1181 HTONS(ip->ip_off);
1182 #endif
1183
1184 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1185 struct ip *, ip, struct ifnet *, ifp,
1186 struct ip *, ip, struct ip6_hdr *, NULL);
1187
1188 error = ipsec4_output(&ipsec_state, sp, flags);
1189
1190 m0 = m = ipsec_state.m;
1191
1192 if (flags & IP_ROUTETOIF) {
1193 /*
1194 * if we have tunnel mode SA, we may need to ignore
1195 * IP_ROUTETOIF.
1196 */
1197 if (ipsec_state.tunneled) {
1198 flags &= ~IP_ROUTETOIF;
1199 ipsec_saved_route = ro;
1200 ro = &ipsec_state.ro;
1201 }
1202 } else {
1203 ipsec_saved_route = ro;
1204 ro = &ipsec_state.ro;
1205 }
1206 dst = (struct sockaddr_in *)(void *)ipsec_state.dst;
1207 if (error) {
1208 /* mbuf is already reclaimed in ipsec4_output. */
1209 m0 = NULL;
1210 switch (error) {
1211 case EHOSTUNREACH:
1212 case ENETUNREACH:
1213 case EMSGSIZE:
1214 case ENOBUFS:
1215 case ENOMEM:
1216 break;
1217 default:
1218 printf("ip4_output (ipsec): error code %d\n", error);
1219 /*fall through*/
1220 case ENOENT:
1221 /* don't show these error codes to the user */
1222 error = 0;
1223 break;
1224 }
1225 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 4,0,0,0,0);
1226 goto bad;
1227 }
1228 }
1229
1230 /* be sure to update variables that are affected by ipsec4_output() */
1231 ip = mtod(m, struct ip *);
1232
1233 #ifdef _IP_VHL
1234 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1235 #else
1236 hlen = ip->ip_hl << 2;
1237 #endif
1238 /* Check that there wasn't a route change and src is still valid */
1239 if (ro->ro_rt != NULL && ro->ro_rt->generation_id != route_generation) {
1240 if ((src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL &&
1241 ((flags & (IP_ROUTETOIF | IP_FORWARDING)) == 0)) {
1242 error = EADDRNOTAVAIL;
1243 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1244 5,0,0,0,0);
1245 goto bad;
1246 }
1247 rtfree(ro->ro_rt);
1248 ro->ro_rt = NULL;
1249 if (src_ia != NULL)
1250 IFA_REMREF(&src_ia->ia_ifa);
1251 }
1252
1253 if (ro->ro_rt == NULL) {
1254 if ((flags & IP_ROUTETOIF) == 0) {
1255 printf("ip_output: can't update route after "
1256 "IPsec processing\n");
1257 error = EHOSTUNREACH; /*XXX*/
1258 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1259 6,0,0,0,0);
1260 goto bad;
1261 }
1262 } else {
1263 if (ia)
1264 IFA_REMREF(&ia->ia_ifa);
1265 RT_LOCK_SPIN(ro->ro_rt);
1266 ia = ifatoia(ro->ro_rt->rt_ifa);
1267 if (ia) {
1268 /* Become a regular mutex */
1269 RT_CONVERT_LOCK(ro->ro_rt);
1270 IFA_ADDREF(&ia->ia_ifa);
1271 }
1272 ifp = ro->ro_rt->rt_ifp;
1273 RT_UNLOCK(ro->ro_rt);
1274 }
1275
1276 /* make it flipped, again. */
1277
1278 #if BYTE_ORDER != BIG_ENDIAN
1279 NTOHS(ip->ip_len);
1280 NTOHS(ip->ip_off);
1281 #endif
1282
1283 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END, 7,0xff,0xff,0xff,0xff);
1284
1285 /* Pass to filters again */
1286 if (!TAILQ_EMPTY(&ipv4_filters)) {
1287 struct ipfilter *filter;
1288
1289 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1290
1291 /* Check that a TSO frame isn't passed to a filter.
1292 * This could happen if a filter is inserted while
1293 * TCP is sending the TSO packet.
1294 */
1295 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1296 error = EMSGSIZE;
1297 goto bad;
1298 }
1299
1300 ipf_ref();
1301
1302 /* 4135317 - always pass network byte order to filter */
1303
1304 #if BYTE_ORDER != BIG_ENDIAN
1305 HTONS(ip->ip_len);
1306 HTONS(ip->ip_off);
1307 #endif
1308
1309 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1310 if (filter->ipf_filter.ipf_output) {
1311 errno_t result;
1312 result = filter->ipf_filter.ipf_output(filter->ipf_filter.cookie, (mbuf_t*)&m, ippo);
1313 if (result == EJUSTRETURN) {
1314 ipf_unref();
1315 goto done;
1316 }
1317 if (result != 0) {
1318 ipf_unref();
1319 goto bad;
1320 }
1321 }
1322 }
1323
1324 /* set back to host byte order */
1325 ip = mtod(m, struct ip *);
1326
1327 #if BYTE_ORDER != BIG_ENDIAN
1328 NTOHS(ip->ip_len);
1329 NTOHS(ip->ip_off);
1330 #endif
1331
1332 ipf_unref();
1333 }
1334 skip_ipsec:
1335 #endif /*IPSEC*/
1336
1337 #if IPFIREWALL
1338 /*
1339 * Check with the firewall...
1340 * but not if we are already being fwd'd from a firewall.
1341 */
1342 if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) {
1343 struct sockaddr_in *old = dst;
1344
1345 args.fwa_m = m;
1346 args.fwa_next_hop = dst;
1347 args.fwa_oif = ifp;
1348 off = ip_fw_chk_ptr(&args);
1349 m = args.fwa_m;
1350 dst = args.fwa_next_hop;
1351
1352 /*
1353 * On return we must do the following:
1354 * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new)
1355 * 1<=off<= 0xffff -> DIVERT
1356 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1357 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
1358 * dst != old -> IPFIREWALL_FORWARD
1359 * off==0, dst==old -> accept
1360                  * If some of the above modules are not compiled in, then
1361                  * we shouldn't have to check the corresponding condition
1362 * (because the ipfw control socket should not accept
1363 * unsupported rules), but better play safe and drop
1364 * packets in case of doubt.
1365 */
1366 m0 = m;
1367 if ( (off & IP_FW_PORT_DENY_FLAG) || m == NULL) {
1368 if (m)
1369 m_freem(m);
1370 error = EACCES ;
1371 goto done ;
1372 }
1373 ip = mtod(m, struct ip *);
1374
1375 if (off == 0 && dst == old) {/* common case */
1376 goto pass ;
1377 }
1378 #if DUMMYNET
1379 if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) {
1380 /*
1381 * pass the pkt to dummynet. Need to include
1382 * pipe number, m, ifp, ro, dst because these are
1383 * not recomputed in the next pass.
1384 * All other parameters have been already used and
1385 * so they are not needed anymore.
1386 * XXX note: if the ifp or ro entry are deleted
1387 * while a pkt is in dummynet, we are in trouble!
1388 */
1389 args.fwa_ro = ro;
1390 args.fwa_dst = dst;
1391 args.fwa_oflags = flags;
1392 if (flags & IP_OUTARGS)
1393 args.fwa_ipoa = ipoa;
1394
1395 error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT,
1396 &args, DN_CLIENT_IPFW);
1397 goto done;
1398 }
1399 #endif /* DUMMYNET */
1400 #if IPDIVERT
1401 if (off != 0 && (off & IP_FW_PORT_DYNT_FLAG) == 0) {
1402 struct mbuf *clone = NULL;
1403
1404 /* Clone packet if we're doing a 'tee' */
1405 if ((off & IP_FW_PORT_TEE_FLAG) != 0)
1406 clone = m_dup(m, M_DONTWAIT);
1407 /*
1408 * XXX
1409 * delayed checksums are not currently compatible
1410 * with divert sockets.
1411 */
1412 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1413 in_delayed_cksum(m);
1414 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1415 }
1416
1417 /* Restore packet header fields to original values */
1418
1419 #if BYTE_ORDER != BIG_ENDIAN
1420 HTONS(ip->ip_len);
1421 HTONS(ip->ip_off);
1422 #endif
1423
1424 /* Deliver packet to divert input routine */
1425 divert_packet(m, 0, off & 0xffff, args.fwa_divert_rule);
1426
1427 /* If 'tee', continue with original packet */
1428 if (clone != NULL) {
1429 m0 = m = clone;
1430 ip = mtod(m, struct ip *);
1431 goto pass;
1432 }
1433 goto done;
1434 }
1435 #endif
1436
1437 #if IPFIREWALL_FORWARD
1438 /* Here we check dst to make sure it's directly reachable on the
1439 * interface we previously thought it was.
1440 * If it isn't (which may be likely in some situations) we have
1441 * to re-route it (ie, find a route for the next-hop and the
1442 * associated interface) and set them here. This is nested
1443 * forwarding which in most cases is undesirable, except where
1444 * such control is nigh impossible. So we do it here.
1445 * And I'm babbling.
1446 */
1447 if (off == 0 && old != dst) {
1448 struct in_ifaddr *ia_fw;
1449
1450 /* It's changed... */
1451 /* There must be a better way to do this next line... */
1452 static struct route sro_fwd, *ro_fwd = &sro_fwd;
1453 #if IPFIREWALL_FORWARD_DEBUG
1454 printf("IPFIREWALL_FORWARD: New dst ip: ");
1455 print_ip(dst->sin_addr);
1456 printf("\n");
1457 #endif
1458 /*
1459 * We need to figure out if we have been forwarded
1460 * to a local socket. If so then we should somehow
1461 * "loop back" to ip_input, and get directed to the
1462 * PCB as if we had received this packet. This is
1463                          * because it may be difficult to identify the packets
1464                          * you want to forward until they are being output
1465                          * and have selected an interface (e.g. locally
1466                          * initiated packets). If we used the loopback interface,
1467                          * we would not be able to control what happens
1468                          * as the packet runs through ip_input(), as
1469                          * it is done in an ISR.
1470 */
1471 lck_rw_lock_shared(in_ifaddr_rwlock);
1472 TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) {
1473 /*
1474 * If the addr to forward to is one
1475 * of ours, we pretend to
1476 * be the destination for this packet.
1477 */
1478 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1479 if (IA_SIN(ia_fw)->sin_addr.s_addr ==
1480 dst->sin_addr.s_addr) {
1481 IFA_UNLOCK(&ia_fw->ia_ifa);
1482 break;
1483 }
1484 IFA_UNLOCK(&ia_fw->ia_ifa);
1485 }
1486 lck_rw_done(in_ifaddr_rwlock);
1487 if (ia_fw) {
1488 /* tell ip_input "dont filter" */
1489 struct m_tag *fwd_tag;
1490 struct ip_fwd_tag *ipfwd_tag;
1491
1492 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
1493 KERNEL_TAG_TYPE_IPFORWARD,
1494 sizeof (*ipfwd_tag), M_NOWAIT, m);
1495 if (fwd_tag == NULL) {
1496 error = ENOBUFS;
1497 goto bad;
1498 }
1499
1500 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
1501 ipfwd_tag->next_hop = args.fwa_next_hop;
1502
1503 m_tag_prepend(m, fwd_tag);
1504
1505 if (m->m_pkthdr.rcvif == NULL)
1506 m->m_pkthdr.rcvif = lo_ifp;
1507 if ((~IF_HWASSIST_CSUM_FLAGS(m->m_pkthdr.rcvif->if_hwassist) &
1508 m->m_pkthdr.csum_flags) == 0) {
1509 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1510 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1511 m->m_pkthdr.csum_flags |=
1512 CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
1513 m->m_pkthdr.csum_data = 0xffff;
1514 }
1515 m->m_pkthdr.csum_flags |=
1516 CSUM_IP_CHECKED | CSUM_IP_VALID;
1517 }
1518 else if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1519 in_delayed_cksum(m);
1520 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1521 ip->ip_sum = in_cksum(m, hlen);
1522 }
1523
1524 #if BYTE_ORDER != BIG_ENDIAN
1525 HTONS(ip->ip_len);
1526 HTONS(ip->ip_off);
1527 #endif
1528
1529 /* we need to call dlil_output to run filters
1530 * and resync to avoid recursion loops.
1531 */
1532 if (lo_ifp) {
1533 dlil_output(lo_ifp, PF_INET, m, 0,
1534 (struct sockaddr *)dst, 0, adv);
1535 }
1536 else {
1537 printf("ip_output: no loopback ifp for forwarding!!!\n");
1538 }
1539 goto done;
1540 }
1541 /* Some of the logic for this was
1542 * nicked from above.
1543 *
1544 * This rewrites the cached route in a local PCB.
1545 * Is this what we want to do?
1546 */
1547 bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
1548
1549 ro_fwd->ro_rt = NULL;
1550 rtalloc_ign(ro_fwd, RTF_PRCLONING);
1551
1552 if (ro_fwd->ro_rt == NULL) {
1553 OSAddAtomic(1, &ipstat.ips_noroute);
1554 error = EHOSTUNREACH;
1555 goto bad;
1556 }
1557
1558 RT_LOCK_SPIN(ro_fwd->ro_rt);
1559 ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa);
1560 if (ia_fw != NULL) {
1561 /* Become a regular mutex */
1562 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1563 IFA_ADDREF(&ia_fw->ia_ifa);
1564 }
1565 ifp = ro_fwd->ro_rt->rt_ifp;
1566 ro_fwd->ro_rt->rt_use++;
1567 if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
1568 dst = (struct sockaddr_in *)(void *)ro_fwd->ro_rt->rt_gateway;
1569 if (ro_fwd->ro_rt->rt_flags & RTF_HOST) {
1570 isbroadcast =
1571 (ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
1572 } else {
1573 /* Become a regular mutex */
1574 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1575 isbroadcast = in_broadcast(dst->sin_addr, ifp);
1576 }
1577 RT_UNLOCK(ro_fwd->ro_rt);
1578 rtfree(ro->ro_rt);
1579 ro->ro_rt = ro_fwd->ro_rt;
1580 dst = (struct sockaddr_in *)(void *)&ro_fwd->ro_dst;
1581
1582 /*
1583 * If we added a default src ip earlier,
1584 * which would have been gotten from the-then
1585 * interface, do it again, from the new one.
1586 */
1587 if (ia_fw != NULL) {
1588 if (fwd_rewrite_src) {
1589 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1590 ip->ip_src = IA_SIN(ia_fw)->sin_addr;
1591 IFA_UNLOCK(&ia_fw->ia_ifa);
1592 }
1593 IFA_REMREF(&ia_fw->ia_ifa);
1594 }
1595 goto pass ;
1596 }
1597 #endif /* IPFIREWALL_FORWARD */
1598 /*
1599 * if we get here, none of the above matches, and
1600 * we have to drop the pkt
1601 */
1602 m_freem(m);
1603 error = EACCES; /* not sure this is the right error msg */
1604 goto done;
1605 }
1606
1607 pass:
1608 #endif /* IPFIREWALL */
1609 #if __APPLE__
1610 /* Do not allow loopback address to wind up on a wire */
1611 if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
1612 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1613 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1614 OSAddAtomic(1, &ipstat.ips_badaddr);
1615 m_freem(m);
1616 /*
1617                  * Do not simply drop the packet just like a firewall -- we want
1618                  * the application to feel the pain.
1619 * Return ENETUNREACH like ip6_output does in some similar cases.
1620 * This can startle the otherwise clueless process that specifies
1621 * loopback as the source address.
1622 */
1623 error = ENETUNREACH;
1624 goto done;
1625 }
1626 #endif
1627 m->m_pkthdr.csum_flags |= CSUM_IP;
1628 tso = (ifp->if_hwassist & IFNET_TSO_IPV4) && (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4);
1629
1630 sw_csum = m->m_pkthdr.csum_flags
1631 & ~IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1632
1633 if ((ifp->if_hwassist & CSUM_TCP_SUM16) != 0) {
1634 /*
1635 * Special case code for GMACE
1636                  * frames that can be checksummed by GMACE SUM16 HW:
1637 * frame >64, no fragments, no UDP
1638 */
1639 if (apple_hwcksum_tx && (m->m_pkthdr.csum_flags & CSUM_TCP)
1640 && (ip->ip_len > 50) && (ip->ip_len <= ifp->if_mtu)) {
1641 /* Apple GMAC HW, expects STUFF_OFFSET << 16 | START_OFFSET */
1642 u_short offset = (IP_VHL_HL(ip->ip_vhl) << 2) +14 ; /* IP+Enet header length */
1643 u_short csumprev= m->m_pkthdr.csum_data & 0xFFFF;
1644 m->m_pkthdr.csum_flags = CSUM_DATA_VALID | CSUM_TCP_SUM16; /* for GMAC */
1645 m->m_pkthdr.csum_data = (csumprev + offset) << 16 ;
1646 m->m_pkthdr.csum_data += offset;
1647 sw_csum = CSUM_DELAY_IP; /* do IP hdr chksum in software */
1648 } else {
1649 /* let the software handle any UDP or TCP checksums */
1650 sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
1651 }
1652 } else if (apple_hwcksum_tx == 0) {
1653 sw_csum |= (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
1654 m->m_pkthdr.csum_flags;
1655 }
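        /*
         * Editor's note -- worked example, not part of the original source:
         * for a plain 20-byte IP header the GMAC path above computes
         * offset = 20 + 14 = 34 bytes from the start of the Ethernet frame.
         * With TCP, the low 16 bits of the incoming csum_data hold the
         * checksum field offset within the transport header (16 for TCP),
         * so the packed value becomes ((16 + 34) << 16) | 34 == 0x00320022,
         * i.e. STUFF_OFFSET 50 and START_OFFSET 34 as the hardware expects.
         */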
1656
1657 if (sw_csum & CSUM_DELAY_DATA) {
1658 in_delayed_cksum(m);
1659 sw_csum &= ~CSUM_DELAY_DATA;
1660 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1661 }
1662
1663 if (apple_hwcksum_tx != 0) {
1664 m->m_pkthdr.csum_flags &=
1665 IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist);
1666 } else {
1667 m->m_pkthdr.csum_flags = 0;
1668 }
1669
1670 /*
1671 * If small enough for interface, or the interface will take
1672 * care of the fragmentation for us, can just send directly.
1673 */
1674 if ((u_short)ip->ip_len <= ifp->if_mtu || tso ||
1675 ifp->if_hwassist & CSUM_FRAGMENT) {
1676 if (tso)
1677 m->m_pkthdr.csum_flags |= CSUM_TSO_IPV4;
1678
1679
1680 #if BYTE_ORDER != BIG_ENDIAN
1681 HTONS(ip->ip_len);
1682 HTONS(ip->ip_off);
1683 #endif
1684
1685 ip->ip_sum = 0;
1686 if (sw_csum & CSUM_DELAY_IP) {
1687 ip->ip_sum = in_cksum(m, hlen);
1688 }
1689
1690 #ifndef __APPLE__
1691 /* Record statistics for this interface address. */
1692 if (!(flags & IP_FORWARDING) && ia != NULL) {
1693 ia->ia_ifa.if_opackets++;
1694 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1695 }
1696 #endif
1697
1698 #if IPSEC
1699 /* clean ipsec history once it goes out of the node */
1700 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1701 ipsec_delaux(m);
1702 #endif
1703 if (packetchain == 0) {
1704 if (ro->ro_rt && nstat_collect)
1705 nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0);
1706 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1707 (struct sockaddr *)dst, 0, adv);
1708 goto done;
1709 }
1710 else { /* packet chaining allows us to reuse the route for all packets */
1711 bytecnt += m->m_pkthdr.len;
1712 mppn = &m->m_nextpkt;
1713 m = m->m_nextpkt;
1714 if (m == NULL) {
1715 #if PF
1716 sendchain:
1717 #endif /* PF */
1718 if (pktcnt > ip_maxchainsent)
1719 ip_maxchainsent = pktcnt;
1720 if (ro->ro_rt && nstat_collect)
1721 nstat_route_tx(ro->ro_rt, pktcnt, bytecnt, 0);
1722 //send
1723 error = dlil_output(ifp, PF_INET, packetlist,
1724 ro->ro_rt, (struct sockaddr *)dst, 0, adv);
1725 pktcnt = 0;
1726 bytecnt = 0;
1727 goto done;
1728
1729 }
1730 m0 = m;
1731 pktcnt++;
1732 goto loopit;
1733 }
1734 }
1735 /*
1736 * Too large for interface; fragment if possible.
1737 * Must be able to put at least 8 bytes per fragment.
1738 */
1739
1740 if (ip->ip_off & IP_DF || (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) ||
1741 pktcnt > 0) {
1742 error = EMSGSIZE;
1743 /*
1744 * This case can happen if the user changed the MTU
1745 * of an interface after enabling IP on it. Because
1746 * most netifs don't keep track of routes pointing to
1747 * them, there is no way for one to update all its
1748 * routes when the MTU is changed.
1749 */
1750 if (ro->ro_rt) {
1751 RT_LOCK_SPIN(ro->ro_rt);
1752 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST))
1753 && !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU)
1754 && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
1755 ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
1756 }
1757 RT_UNLOCK(ro->ro_rt);
1758 }
1759 if (pktcnt > 0) {
1760 m0 = packetlist;
1761 }
1762 OSAddAtomic(1, &ipstat.ips_cantfrag);
1763 goto bad;
1764 }
1765
1766 error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum);
1767 if (error != 0) {
1768 m0 = m = NULL;
1769 goto bad;
1770 }
1771
1772 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1773 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1774
1775 for (m = m0; m; m = m0) {
1776 m0 = m->m_nextpkt;
1777 m->m_nextpkt = 0;
1778 #if IPSEC
1779 /* clean ipsec history once it goes out of the node */
1780 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0)
1781 ipsec_delaux(m);
1782 #endif
1783 if (error == 0) {
1784 #ifndef __APPLE__
1785 /* Record statistics for this interface address. */
1786 if (ia != NULL) {
1787 ia->ia_ifa.if_opackets++;
1788 ia->ia_ifa.if_obytes += m->m_pkthdr.len;
1789 }
1790 #endif
1791 if ((packetchain != 0) && (pktcnt > 0))
1792 panic("ip_output: mix of packet in packetlist is wrong=%p", packetlist);
1793 if (ro->ro_rt && nstat_collect)
1794 nstat_route_tx(ro->ro_rt, 1, m->m_pkthdr.len, 0);
1795 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1796 (struct sockaddr *)dst, 0, adv);
1797 } else
1798 m_freem(m);
1799 }
1800
1801 if (error == 0)
1802 OSAddAtomic(1, &ipstat.ips_fragmented);
1803
1804 done:
1805 if (ia) {
1806 IFA_REMREF(&ia->ia_ifa);
1807 ia = NULL;
1808 }
1809 #if IPSEC
1810 if (ipsec_bypass == 0 && (flags & IP_NOIPSEC) == 0) {
1811 if (ipsec_state.ro.ro_rt)
1812 rtfree(ipsec_state.ro.ro_rt);
1813 if (sp != NULL) {
1814 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1815 printf("DP ip_output call free SP:%x\n", sp));
1816 key_freesp(sp, KEY_SADB_UNLOCKED);
1817 }
1818 }
1819 #endif /* IPSEC */
1820
1821 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error,0,0,0,0);
1822 return (error);
1823 bad:
1824 m_freem(m0);
1825 goto done;
1826 }
1827
1828 int
1829 ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
1830 {
1831 struct ip *ip, *mhip;
1832 int len, hlen, mhlen, firstlen, off, error = 0;
1833 struct mbuf **mnext = &m->m_nextpkt, *m0;
1834 int nfrags = 1;
1835
1836 ip = mtod(m, struct ip *);
1837 #ifdef _IP_VHL
1838 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1839 #else
1840 hlen = ip->ip_hl << 2;
1841 #endif
1842
1843 firstlen = len = (mtu - hlen) &~ 7;
1844 if (len < 8) {
1845 m_freem(m);
1846 return (EMSGSIZE);
1847 }
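        /*
         * Editor's note -- worked example, not part of the original source:
         * with mtu = 1500 and a 20-byte header, firstlen = len =
         * (1500 - 20) & ~7 = 1480.  A 4000-byte datagram (3980 bytes of
         * payload) is therefore cut into fragments carrying 1480, 1480 and
         * 1020 bytes at 8-byte-unit offsets 0, 185 and 370 in the loop below.
         */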
1848
1849 /*
1850 * if the interface will not calculate checksums on
1851 * fragmented packets, then do it here.
1852 */
1853 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA &&
1854 (ifp->if_hwassist & CSUM_IP_FRAGS) == 0) {
1855 in_delayed_cksum(m);
1856 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1857 }
1858
1859 /*
1860 * Loop through length of segment after first fragment,
1861 * make new header and copy data of each part and link onto chain.
1862 */
1863 m0 = m;
1864 mhlen = sizeof (struct ip);
1865 for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1866 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
1867 if (m == 0) {
1868 error = ENOBUFS;
1869 OSAddAtomic(1, &ipstat.ips_odropped);
1870 goto sendorfree;
1871 }
1872 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1873 m->m_data += max_linkhdr;
1874 mhip = mtod(m, struct ip *);
1875 *mhip = *ip;
1876 if (hlen > sizeof (struct ip)) {
1877 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
1878 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1879 }
1880 m->m_len = mhlen;
1881 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
1882 if (ip->ip_off & IP_MF)
1883 mhip->ip_off |= IP_MF;
1884 if (off + len >= (u_short)ip->ip_len)
1885 len = (u_short)ip->ip_len - off;
1886 else
1887 mhip->ip_off |= IP_MF;
1888 mhip->ip_len = htons((u_short)(len + mhlen));
1889 m->m_next = m_copy(m0, off, len);
1890 if (m->m_next == 0) {
1891 (void) m_free(m);
1892 error = ENOBUFS; /* ??? */
1893 OSAddAtomic(1, &ipstat.ips_odropped);
1894 goto sendorfree;
1895 }
1896 m->m_pkthdr.len = mhlen + len;
1897 m->m_pkthdr.rcvif = 0;
1898 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1899 m->m_pkthdr.socket_id = m0->m_pkthdr.socket_id;
1900
1901 M_COPY_PFTAG(m, m0);
1902 m_set_service_class(m, m0->m_pkthdr.svc);
1903
1904 #if CONFIG_MACF_NET
1905 mac_netinet_fragment(m0, m);
1906 #endif
1907
1908 #if BYTE_ORDER != BIG_ENDIAN
1909 HTONS(mhip->ip_off);
1910 #endif
1911
1912 mhip->ip_sum = 0;
1913 if (sw_csum & CSUM_DELAY_IP) {
1914 mhip->ip_sum = in_cksum(m, mhlen);
1915 }
1916 *mnext = m;
1917 mnext = &m->m_nextpkt;
1918 nfrags++;
1919 }
1920 OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1921
1922 /* set first/last markers for fragment chain */
1923 m->m_flags |= M_LASTFRAG;
1924 m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1925 m0->m_pkthdr.csum_data = nfrags;
1926
1927 /*
1928 * Update first fragment by trimming what's been copied out
1929 * and updating header, then send each fragment (in order).
1930 */
1931 m = m0;
1932 m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1933 m->m_pkthdr.len = hlen + firstlen;
1934 ip->ip_len = htons((u_short)m->m_pkthdr.len);
1935 ip->ip_off |= IP_MF;
1936
1937 #if BYTE_ORDER != BIG_ENDIAN
1938 HTONS(ip->ip_off);
1939 #endif
1940
1941 ip->ip_sum = 0;
1942 if (sw_csum & CSUM_DELAY_IP) {
1943 ip->ip_sum = in_cksum(m, hlen);
1944 }
1945 sendorfree:
1946 if (error)
1947 m_freem_list(m0);
1948
1949 return (error);
1950 }
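
/*
 * Editor's sketch (not part of the kernel source): the fragment sizing that
 * ip_fragment() performs above, shown for a flat datagram.  The payload per
 * fragment is rounded down to a multiple of 8 because ip_off counts 8-byte
 * units; anything smaller than 8 bytes is rejected with EMSGSIZE.
 */
static unsigned int
frag_count_sketch(unsigned int ip_len, unsigned int hlen, unsigned int mtu)
{
        unsigned int payload = ip_len - hlen;           /* bytes to split up */
        unsigned int per_frag = (mtu - hlen) & ~7U;     /* same rounding as above */

        if (per_frag < 8)
                return (0);                             /* ip_fragment() would fail */
        return ((payload + per_frag - 1) / per_frag);   /* ceiling division */
}
/* e.g. frag_count_sketch(4000, 20, 1500) == 3 for a standard Ethernet MTU. */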
1951
1952 static void
1953 ip_out_cksum_stats(int proto, u_int32_t len)
1954 {
1955 switch (proto) {
1956 case IPPROTO_TCP:
1957 tcp_out_cksum_stats(len);
1958 break;
1959 case IPPROTO_UDP:
1960 udp_out_cksum_stats(len);
1961 break;
1962 default:
1963 /* keep only TCP or UDP stats for now */
1964 break;
1965 }
1966 }
1967
1968 void
1969 in_delayed_cksum_offset(struct mbuf *m0, int ip_offset)
1970 {
1971 struct ip *ip;
1972 unsigned char buf[sizeof(struct ip)];
1973 u_short csum, offset, ip_len;
1974
1975 /* Save copy of first mbuf pointer and the ip_offset before modifying */
1976 struct mbuf *m = m0;
1977 int ip_offset_copy = ip_offset;
1978
1979 while (ip_offset >= m->m_len) {
1980 ip_offset -= m->m_len;
1981 m = m->m_next;
1982 if (m == NULL) {
1983 printf("in_delayed_cksum_withoffset failed - "
1984 "ip_offset wasn't in the packet\n");
1985 return;
1986 }
1987 }
1988
1989 /*
1990 * In case the IP header is not contiguous, or not 32-bit
1991 * aligned, copy it to a local buffer.
1992 */
1993 if ((ip_offset + sizeof(struct ip) > m->m_len) ||
1994 !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) {
1995 #if DEBUG
1996 printf("delayed m_pullup, m->len: %d off: %d\n",
1997 m->m_len, ip_offset);
1998 #endif
1999 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
2000
2001 ip = (struct ip *)(void *)buf;
2002 } else {
2003 ip = (struct ip*)(void *)(m->m_data + ip_offset);
2004 }
2005
2006 /* Gross */
2007 if (ip_offset) {
2008 m->m_len -= ip_offset;
2009 m->m_data += ip_offset;
2010 }
2011
2012 offset = IP_VHL_HL(ip->ip_vhl) << 2;
2013
2014 /*
2015 * We could be in the context of an IP or interface filter; in the
2016 * former case, ip_len would be in host (correct) order while for
2017 * the latter it would be in network order. Because of this, we
2018 * attempt to interpret the length field by comparing it against
2019 * the actual packet length. If the comparison fails, byte swap
2020 * the length and check again. If it still fails, then the packet
2021 * is bogus and we give up.
2022 */
2023 ip_len = ip->ip_len;
2024 if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) {
2025 ip_len = SWAP16(ip_len);
2026 if (ip_len != (m0->m_pkthdr.len - ip_offset_copy)) {
2027 printf("in_delayed_cksum_offset: ip_len %d (%d) "
2028 "doesn't match actual length %d\n", ip->ip_len,
2029 ip_len, (m0->m_pkthdr.len - ip_offset_copy));
2030 return;
2031 }
2032 }
2033
2034 csum = in_cksum_skip(m, ip_len, offset);
2035
2036 /* Update stats */
2037 ip_out_cksum_stats(ip->ip_p, ip_len - offset);
2038
2039 if (m0->m_pkthdr.csum_flags & CSUM_UDP && csum == 0)
2040 csum = 0xffff;
2041 offset += m0->m_pkthdr.csum_data & 0xFFFF; /* checksum offset */
2042
2043 /* Gross */
2044 if (ip_offset) {
2045 if (M_LEADINGSPACE(m) < ip_offset)
2046 panic("in_delayed_cksum_offset - chain modified!\n");
2047 m->m_len += ip_offset;
2048 m->m_data -= ip_offset;
2049 }
2050
2051 if (offset > ip_len) /* bogus offset */
2052 return;
2053
2054 /* Insert the checksum in the existing chain */
2055 if (offset + ip_offset + sizeof(u_short) > m->m_len) {
2056 char tmp[2];
2057
2058 #if DEBUG
2059 printf("delayed m_copyback, m->len: %d off: %d p: %d\n",
2060 m->m_len, offset + ip_offset, ip->ip_p);
2061 #endif
2062 *(u_short *)(void *)tmp = csum;
2063 m_copyback(m, offset + ip_offset, 2, tmp);
2064 } else if (IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) {
2065 *(u_short *)(void *)(m->m_data + offset + ip_offset) = csum;
2066 } else {
2067 bcopy(&csum, (m->m_data + offset + ip_offset), sizeof (csum));
2068 }
2069 }
2070
2071 void
2072 in_delayed_cksum(struct mbuf *m)
2073 {
2074 in_delayed_cksum_offset(m, 0);
2075 }
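
/*
 * Editor's sketch (not kernel source): the RFC 1071 one's-complement checksum
 * that in_cksum()/in_cksum_skip() compute over the mbuf chain, written here
 * for a flat buffer.  The delayed-checksum routines above run it over the
 * transport header and payload (skipping the IP header) and then store the
 * 16-bit result csum_data bytes into the transport header.
 */
static unsigned short
rfc1071_cksum_sketch(const unsigned char *buf, int len)
{
        unsigned long sum = 0;

        while (len > 1) {                       /* add up 16-bit words */
                sum += ((unsigned long)buf[0] << 8) | buf[1];
                buf += 2;
                len -= 2;
        }
        if (len == 1)                           /* odd trailing byte */
                sum += (unsigned long)buf[0] << 8;
        while (sum >> 16)                       /* fold the carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return ((unsigned short)~sum);
}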
2076
2077 void
2078 in_cksum_offset(struct mbuf* m, size_t ip_offset)
2079 {
2080 struct ip* ip = NULL;
2081 int hlen = 0;
2082 unsigned char buf[sizeof(struct ip)];
2083 int swapped = 0;
2084
2085 /* Save copy of first mbuf pointer and the ip_offset before modifying */
2086 struct mbuf* m0 = m;
2087 size_t ip_offset_copy = ip_offset;
2088
2089 while (ip_offset >= m->m_len) {
2090 ip_offset -= m->m_len;
2091 m = m->m_next;
2092 if (m == NULL) {
2093 printf("in_cksum_offset failed - ip_offset wasn't "
2094 "in the packet\n");
2095 return;
2096 }
2097 }
2098
2099 /*
2100 * In case the IP header is not contiguous, or not 32-bit
2101 * aligned, copy it to a local buffer.
2102 */
2103 if ((ip_offset + sizeof(struct ip) > m->m_len) ||
2104 !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) {
2105 #if DEBUG
2106 printf("in_cksum_offset - delayed m_pullup, m->len: %d "
2107 "off: %lu\n", m->m_len, ip_offset);
2108 #endif
2109 m_copydata(m, ip_offset, sizeof(struct ip), (caddr_t) buf);
2110
2111 ip = (struct ip *)(void *)buf;
2112 ip->ip_sum = 0;
2113 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2,
2114 (caddr_t)&ip->ip_sum);
2115 } else {
2116 ip = (struct ip*)(void *)(m->m_data + ip_offset);
2117 ip->ip_sum = 0;
2118 }
2119
2120 /* Gross */
2121 if (ip_offset) {
2122 m->m_len -= ip_offset;
2123 m->m_data += ip_offset;
2124 }
2125
2126 #ifdef _IP_VHL
2127 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2128 #else
2129 hlen = ip->ip_hl << 2;
2130 #endif
2131 /*
2132 * We could be in the context of an IP or interface filter; in the
2133 * former case, ip_len would be in host order while for the latter
2134 * it would be in network (correct) order. Because of this, we
2135 * attempt to interpret the length field by comparing it against
2136 * the actual packet length. If the comparison fails, byte swap
2137 * the length and check again. If it still fails, then the packet
2138 * is bogus and we give up.
2139 */
2140 if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) {
2141 ip->ip_len = SWAP16(ip->ip_len);
2142 swapped = 1;
2143 if (ntohs(ip->ip_len) != (m0->m_pkthdr.len - ip_offset_copy)) {
2144 ip->ip_len = SWAP16(ip->ip_len);
2145 printf("in_cksum_offset: ip_len %d (%d) "
2146 "doesn't match actual length %lu\n",
2147 ip->ip_len, SWAP16(ip->ip_len),
2148 (m0->m_pkthdr.len - ip_offset_copy));
2149 return;
2150 }
2151 }
2152
2153 ip->ip_sum = 0;
2154 ip->ip_sum = in_cksum(m, hlen);
2155 if (swapped)
2156 ip->ip_len = SWAP16(ip->ip_len);
2157
2158 /* Gross */
2159 if (ip_offset) {
2160 if (M_LEADINGSPACE(m) < ip_offset)
2161 panic("in_cksum_offset - chain modified!\n");
2162 m->m_len += ip_offset;
2163 m->m_data -= ip_offset;
2164 }
2165
2166 /*
2167 * Insert the checksum in the existing chain if IP header not
2168 * contiguous, or if it's not 32-bit aligned, i.e. all the cases
2169 * where it was copied to a local buffer.
2170 */
2171 if (ip_offset + sizeof(struct ip) > m->m_len) {
2172 char tmp[2];
2173
2174 #if DEBUG
2175 printf("in_cksum_offset m_copyback, m->len: %u off: %lu "
2176 "p: %d\n", m->m_len,
2177 ip_offset + offsetof(struct ip, ip_sum), ip->ip_p);
2178 #endif
2179 *(u_short *)(void *)tmp = ip->ip_sum;
2180 m_copyback(m, ip_offset + offsetof(struct ip, ip_sum), 2, tmp);
2181 } else if (!IP_HDR_ALIGNED_P(mtod(m, caddr_t) + ip_offset)) {
2182 bcopy(&ip->ip_sum,
2183 (m->m_data + ip_offset + offsetof(struct ip, ip_sum)),
2184 sizeof (u_short));
2185 }
2186 }
2187
2188 /*
2189 * Insert IP options into preformed packet.
2190 * Adjust IP destination as required for IP source routing,
2191 * as indicated by a non-zero in_addr at the start of the options.
2192 *
2193 * XXX This routine assumes that the packet has no options in place.
2194 */
2195 static struct mbuf *
2196 ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
2200 {
2201 struct ipoption *p = mtod(opt, struct ipoption *);
2202 struct mbuf *n;
2203 struct ip *ip = mtod(m, struct ip *);
2204 unsigned optlen;
2205
2206 optlen = opt->m_len - sizeof(p->ipopt_dst);
2207 if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
2208 return (m); /* XXX should fail */
2209 if (p->ipopt_dst.s_addr)
2210 ip->ip_dst = p->ipopt_dst;
2211 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
2212 MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2213 if (n == 0)
2214 return (m);
2215 n->m_pkthdr.rcvif = 0;
2216 #if CONFIG_MACF_NET
2217 mac_mbuf_label_copy(m, n);
2218 #endif
2219 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2220 m->m_len -= sizeof(struct ip);
2221 m->m_data += sizeof(struct ip);
2222 n->m_next = m;
2223 m = n;
2224 m->m_len = optlen + sizeof(struct ip);
2225 m->m_data += max_linkhdr;
2226 (void)memcpy(mtod(m, void *), ip, sizeof(struct ip));
2227 } else {
2228 m->m_data -= optlen;
2229 m->m_len += optlen;
2230 m->m_pkthdr.len += optlen;
2231 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
2232 }
2233 ip = mtod(m, struct ip *);
2234 bcopy(p->ipopt_list, ip + 1, optlen);
2235 *phlen = sizeof(struct ip) + optlen;
2236 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2237 ip->ip_len += optlen;
2238 return (m);
2239 }
2240
2241 /*
2242 * Copy options from ip to jp,
2243 * omitting those not copied during fragmentation.
2244 */
2245 int
2246 ip_optcopy(struct ip *ip, struct ip *jp)
2248 {
2249 u_char *cp, *dp;
2250 int opt, optlen, cnt;
2251
2252 cp = (u_char *)(ip + 1);
2253 dp = (u_char *)(jp + 1);
2254 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
2255 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2256 opt = cp[0];
2257 if (opt == IPOPT_EOL)
2258 break;
2259 if (opt == IPOPT_NOP) {
2260 /* Preserve for IP mcast tunnel's LSRR alignment. */
2261 *dp++ = IPOPT_NOP;
2262 optlen = 1;
2263 continue;
2264 }
2265 #if DIAGNOSTIC
2266 if (cnt < IPOPT_OLEN + sizeof(*cp))
2267 panic("malformed IPv4 option passed to ip_optcopy");
2268 #endif
2269 optlen = cp[IPOPT_OLEN];
2270 #if DIAGNOSTIC
2271 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2272 panic("malformed IPv4 option passed to ip_optcopy");
2273 #endif
2274 /* bogus lengths should have been caught by ip_dooptions */
2275 if (optlen > cnt)
2276 optlen = cnt;
2277 if (IPOPT_COPIED(opt)) {
2278 bcopy(cp, dp, optlen);
2279 dp += optlen;
2280 }
2281 }
2282 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
2283 *dp++ = IPOPT_EOL;
2284 return (optlen);
2285 }
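
/*
 * Editor's note (not kernel source): the "copied" test used by ip_optcopy()
 * above is just the high bit of the option type byte, IPOPT_COPIED(o) ==
 * ((o) & 0x80).  Source-route options (LSRR 0x83, SSRR 0x89) carry the bit
 * and are replicated into every fragment; record-route (0x07) and NOP (0x01)
 * do not, so only the first fragment keeps them.
 */
static int
ipopt_copied_sketch(unsigned char type)
{
        return ((type & 0x80) != 0);    /* same test as IPOPT_COPIED() */
}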
2286
2287 /*
2288 * IP socket option processing.
2289 */
2290 int
2291 ip_ctloutput(struct socket *so, struct sockopt *sopt)
2294 {
2295 struct inpcb *inp = sotoinpcb(so);
2296 int error, optval;
2297
2298 error = optval = 0;
2299 if (sopt->sopt_level != IPPROTO_IP) {
2300 return (EINVAL);
2301 }
2302
2303 switch (sopt->sopt_dir) {
2304 case SOPT_SET:
2305 switch (sopt->sopt_name) {
2306 case IP_OPTIONS:
2307 #ifdef notyet
2308 case IP_RETOPTS:
2309 #endif
2310 {
2311 struct mbuf *m;
2312 if (sopt->sopt_valsize > MLEN) {
2313 error = EMSGSIZE;
2314 break;
2315 }
2316 MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2317 MT_HEADER);
2318 if (m == 0) {
2319 error = ENOBUFS;
2320 break;
2321 }
2322 m->m_len = sopt->sopt_valsize;
2323 error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
2324 m->m_len);
2325 if (error)
2326 break;
2327
2328 return (ip_pcbopts(sopt->sopt_name, &inp->inp_options,
2329 m));
2330 }
2331
2332 case IP_TOS:
2333 case IP_TTL:
2334 case IP_RECVOPTS:
2335 case IP_RECVRETOPTS:
2336 case IP_RECVDSTADDR:
2337 case IP_RECVIF:
2338 case IP_RECVTTL:
2339 case IP_RECVPKTINFO:
2340 error = sooptcopyin(sopt, &optval, sizeof optval,
2341 sizeof optval);
2342 if (error)
2343 break;
2344
2345 switch (sopt->sopt_name) {
2346 case IP_TOS:
2347 inp->inp_ip_tos = optval;
2348 break;
2349
2350 case IP_TTL:
2351 inp->inp_ip_ttl = optval;
2352 break;
2353 #define OPTSET(bit) \
2354 if (optval) \
2355 inp->inp_flags |= bit; \
2356 else \
2357 inp->inp_flags &= ~bit;
2358
2359 case IP_RECVOPTS:
2360 OPTSET(INP_RECVOPTS);
2361 break;
2362
2363 case IP_RECVRETOPTS:
2364 OPTSET(INP_RECVRETOPTS);
2365 break;
2366
2367 case IP_RECVDSTADDR:
2368 OPTSET(INP_RECVDSTADDR);
2369 break;
2370
2371 case IP_RECVIF:
2372 OPTSET(INP_RECVIF);
2373 break;
2374
2375 case IP_RECVTTL:
2376 OPTSET(INP_RECVTTL);
2377 break;
2378
2379 case IP_RECVPKTINFO:
2380 OPTSET(INP_PKTINFO);
2381 break;
2382 }
2383 break;
2384 #undef OPTSET
2385
2386 #if CONFIG_FORCE_OUT_IFP
2387 /*
2388 * Apple private interface, similar to IP_BOUND_IF, except
2389 * that the parameter is a NULL-terminated string containing
2390 * the name of the network interface; an empty string means
2391 * unbind. Applications are encouraged to use IP_BOUND_IF
2392 * instead, as that is the current "official" API.
2393 */
2394 case IP_FORCE_OUT_IFP: {
2395 char ifname[IFNAMSIZ];
2396 unsigned int ifscope;
2397
2398 /* This option is settable only for IPv4 */
2399 if (!(inp->inp_vflag & INP_IPV4)) {
2400 error = EINVAL;
2401 break;
2402 }
2403
2404 /* Verify interface name parameter is sane */
2405 if (sopt->sopt_valsize > sizeof(ifname)) {
2406 error = EINVAL;
2407 break;
2408 }
2409
2410 /* Copy the interface name */
2411 if (sopt->sopt_valsize != 0) {
2412 error = sooptcopyin(sopt, ifname,
2413 sizeof (ifname), sopt->sopt_valsize);
2414 if (error)
2415 break;
2416 }
2417
2418 if (sopt->sopt_valsize == 0 || ifname[0] == '\0') {
2419 /* Unbind this socket from any interface */
2420 ifscope = IFSCOPE_NONE;
2421 } else {
2422 ifnet_t ifp;
2423
2424 /* Verify name is NULL terminated */
2425 if (ifname[sopt->sopt_valsize - 1] != '\0') {
2426 error = EINVAL;
2427 break;
2428 }
2429
2430 /* Bail out if given bogus interface name */
2431 if (ifnet_find_by_name(ifname, &ifp) != 0) {
2432 error = ENXIO;
2433 break;
2434 }
2435
2436 /* Bind this socket to this interface */
2437 ifscope = ifp->if_index;
2438
2439 /*
2440 * Won't actually free; since we don't release
2441 * this later, we should do it now.
2442 */
2443 ifnet_release(ifp);
2444 }
2445 error = inp_bindif(inp, ifscope);
2446 }
2447 break;
2448 #endif
2449 /*
2450 * Multicast socket options are processed by the in_mcast
2451 * module.
2452 */
2453 case IP_MULTICAST_IF:
2454 case IP_MULTICAST_IFINDEX:
2455 case IP_MULTICAST_VIF:
2456 case IP_MULTICAST_TTL:
2457 case IP_MULTICAST_LOOP:
2458 case IP_ADD_MEMBERSHIP:
2459 case IP_DROP_MEMBERSHIP:
2460 case IP_ADD_SOURCE_MEMBERSHIP:
2461 case IP_DROP_SOURCE_MEMBERSHIP:
2462 case IP_BLOCK_SOURCE:
2463 case IP_UNBLOCK_SOURCE:
2464 case IP_MSFILTER:
2465 case MCAST_JOIN_GROUP:
2466 case MCAST_LEAVE_GROUP:
2467 case MCAST_JOIN_SOURCE_GROUP:
2468 case MCAST_LEAVE_SOURCE_GROUP:
2469 case MCAST_BLOCK_SOURCE:
2470 case MCAST_UNBLOCK_SOURCE:
2471 error = inp_setmoptions(inp, sopt);
2472 break;
2473
2474 case IP_PORTRANGE:
2475 error = sooptcopyin(sopt, &optval, sizeof optval,
2476 sizeof optval);
2477 if (error)
2478 break;
2479
2480 switch (optval) {
2481 case IP_PORTRANGE_DEFAULT:
2482 inp->inp_flags &= ~(INP_LOWPORT);
2483 inp->inp_flags &= ~(INP_HIGHPORT);
2484 break;
2485
2486 case IP_PORTRANGE_HIGH:
2487 inp->inp_flags &= ~(INP_LOWPORT);
2488 inp->inp_flags |= INP_HIGHPORT;
2489 break;
2490
2491 case IP_PORTRANGE_LOW:
2492 inp->inp_flags &= ~(INP_HIGHPORT);
2493 inp->inp_flags |= INP_LOWPORT;
2494 break;
2495
2496 default:
2497 error = EINVAL;
2498 break;
2499 }
2500 break;
2501
2502 #if IPSEC
2503 case IP_IPSEC_POLICY:
2504 {
2505 caddr_t req = NULL;
2506 size_t len = 0;
2507 int priv;
2508 struct mbuf *m;
2509 int optname;
2510
2511 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
2512 break;
2513 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
2514 break;
2515 priv = (proc_suser(sopt->sopt_p) == 0);
2516 if (m) {
2517 req = mtod(m, caddr_t);
2518 len = m->m_len;
2519 }
2520 optname = sopt->sopt_name;
2521 error = ipsec4_set_policy(inp, optname, req, len, priv);
2522 m_freem(m);
2523 break;
2524 }
2525 #endif /*IPSEC*/
2526
2527 #if TRAFFIC_MGT
2528 case IP_TRAFFIC_MGT_BACKGROUND:
2529 {
2530 unsigned background = 0;
2531 error = sooptcopyin(sopt, &background, sizeof(background), sizeof(background));
2532 if (error)
2533 break;
2534
2535 if (background) {
2536 socket_set_traffic_mgt_flags_locked(so,
2537 TRAFFIC_MGT_SO_BACKGROUND);
2538 } else {
2539 socket_clear_traffic_mgt_flags_locked(so,
2540 TRAFFIC_MGT_SO_BACKGROUND);
2541 }
2542
2543 break;
2544 }
2545 #endif /* TRAFFIC_MGT */
2546
2547 /*
2548 * On a multihomed system, scoped routing can be used to
2549 * restrict the source interface used for sending packets.
2550 * The socket option IP_BOUND_IF binds a particular AF_INET
2551 * socket to an interface such that data sent on the socket
2552 * is restricted to that interface. This is unlike the
2553 * SO_DONTROUTE option where the routing table is bypassed;
2554 * therefore it allows for greater flexibility and control
2555 * over the system behavior, and does not place any restriction
2556 * on the destination address type (e.g. unicast, multicast,
2557 * or broadcast if applicable) or whether or not the host is
2558 * directly reachable. Note that in the multicast transmit
2559 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2560 * IP_BOUND_IF, since the former practically bypasses the
2561 * routing table; in this case, IP_BOUND_IF sets the default
2562 * interface used for sending multicast packets in the absence
2563 * of an explicit multicast transmit interface.
2564 */
2565 case IP_BOUND_IF:
2566 /* This option is settable only for IPv4 */
2567 if (!(inp->inp_vflag & INP_IPV4)) {
2568 error = EINVAL;
2569 break;
2570 }
2571
2572 error = sooptcopyin(sopt, &optval, sizeof (optval),
2573 sizeof (optval));
2574
2575 if (error)
2576 break;
2577
2578 error = inp_bindif(inp, optval);
2579 break;
2580
2581 case IP_NO_IFT_CELLULAR:
2582 /* This option is settable only for IPv4 */
2583 if (!(inp->inp_vflag & INP_IPV4)) {
2584 error = EINVAL;
2585 break;
2586 }
2587
2588 error = sooptcopyin(sopt, &optval, sizeof (optval),
2589 sizeof (optval));
2590
2591 if (error)
2592 break;
2593
2594 error = inp_nocellular(inp, optval);
2595 break;
2596
2597 case IP_OUT_IF:
2598 /* This option is not settable */
2599 error = EINVAL;
2600 break;
2601
2602 default:
2603 error = ENOPROTOOPT;
2604 break;
2605 }
2606 break;
2607
2608 case SOPT_GET:
2609 switch (sopt->sopt_name) {
2610 case IP_OPTIONS:
2611 case IP_RETOPTS:
2612 if (inp->inp_options)
2613 error = sooptcopyout(sopt,
2614 mtod(inp->inp_options,
2615 char *),
2616 inp->inp_options->m_len);
2617 else
2618 sopt->sopt_valsize = 0;
2619 break;
2620
2621 case IP_TOS:
2622 case IP_TTL:
2623 case IP_RECVOPTS:
2624 case IP_RECVRETOPTS:
2625 case IP_RECVDSTADDR:
2626 case IP_RECVIF:
2627 case IP_RECVTTL:
2628 case IP_PORTRANGE:
2629 case IP_RECVPKTINFO:
2630 switch (sopt->sopt_name) {
2631
2632 case IP_TOS:
2633 optval = inp->inp_ip_tos;
2634 break;
2635
2636 case IP_TTL:
2637 optval = inp->inp_ip_ttl;
2638 break;
2639
2640 #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
2641
2642 case IP_RECVOPTS:
2643 optval = OPTBIT(INP_RECVOPTS);
2644 break;
2645
2646 case IP_RECVRETOPTS:
2647 optval = OPTBIT(INP_RECVRETOPTS);
2648 break;
2649
2650 case IP_RECVDSTADDR:
2651 optval = OPTBIT(INP_RECVDSTADDR);
2652 break;
2653
2654 case IP_RECVIF:
2655 optval = OPTBIT(INP_RECVIF);
2656 break;
2657
2658 case IP_RECVTTL:
2659 optval = OPTBIT(INP_RECVTTL);
2660 break;
2661
2662 case IP_PORTRANGE:
2663 if (inp->inp_flags & INP_HIGHPORT)
2664 optval = IP_PORTRANGE_HIGH;
2665 else if (inp->inp_flags & INP_LOWPORT)
2666 optval = IP_PORTRANGE_LOW;
2667 else
2668 optval = 0;
2669 break;
2670
2671 case IP_RECVPKTINFO:
2672 optval = OPTBIT(INP_PKTINFO);
2673 break;
2674 }
2675 error = sooptcopyout(sopt, &optval, sizeof optval);
2676 break;
2677
2678 case IP_MULTICAST_IF:
2679 case IP_MULTICAST_IFINDEX:
2680 case IP_MULTICAST_VIF:
2681 case IP_MULTICAST_TTL:
2682 case IP_MULTICAST_LOOP:
2683 case IP_MSFILTER:
2684 error = inp_getmoptions(inp, sopt);
2685 break;
2686
2687 #if IPSEC
2688 case IP_IPSEC_POLICY:
2689 {
2690 struct mbuf *m = NULL;
2691 caddr_t req = NULL;
2692 size_t len = 0;
2693
2694 if (m != 0) {
2695 req = mtod(m, caddr_t);
2696 len = m->m_len;
2697 }
2698 error = ipsec4_get_policy(sotoinpcb(so), req, len, &m);
2699 if (error == 0)
2700 error = soopt_mcopyout(sopt, m); /* XXX */
2701 if (error == 0)
2702 m_freem(m);
2703 break;
2704 }
2705 #endif /*IPSEC*/
2706
2707 #if TRAFFIC_MGT
2708 case IP_TRAFFIC_MGT_BACKGROUND:
2709 {
2710 unsigned background = (so->so_traffic_mgt_flags & TRAFFIC_MGT_SO_BACKGROUND);
2711 return (sooptcopyout(sopt, &background, sizeof(background)));
2712 break;
2713 }
2714 #endif /* TRAFFIC_MGT */
2715
2716 case IP_BOUND_IF:
2717 if (inp->inp_flags & INP_BOUND_IF)
2718 optval = inp->inp_boundifp->if_index;
2719 error = sooptcopyout(sopt, &optval, sizeof (optval));
2720 break;
2721
2722 case IP_NO_IFT_CELLULAR:
2723 optval = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
2724 error = sooptcopyout(sopt, &optval, sizeof (optval));
2725 break;
2726
2727 case IP_OUT_IF:
2728 optval = (inp->inp_last_outifp != NULL) ?
2729 inp->inp_last_outifp->if_index : 0;
2730 error = sooptcopyout(sopt, &optval, sizeof (optval));
2731 break;
2732
2733 default:
2734 error = ENOPROTOOPT;
2735 break;
2736 }
2737 break;
2738 }
2739 return (error);
2740 }
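
/*
 * Editor's sketch (not kernel source): exercising a few of the socket options
 * handled by ip_ctloutput() above from user space, assuming Darwin headers
 * that define IP_BOUND_IF and IP_RECVPKTINFO.  The interface name "en0" and
 * the TOS value are only examples.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>                     /* if_nametoindex() */

static int
bind_and_tag_sketch(int s)
{
        int idx = (int)if_nametoindex("en0");   /* 0 would unbind the socket */
        int tos = 0x10;                         /* IPTOS_LOWDELAY */
        int on = 1;

        if (setsockopt(s, IPPROTO_IP, IP_BOUND_IF, &idx, sizeof (idx)) != 0)
                return (-1);
        if (setsockopt(s, IPPROTO_IP, IP_TOS, &tos, sizeof (tos)) != 0)
                return (-1);
        return (setsockopt(s, IPPROTO_IP, IP_RECVPKTINFO, &on, sizeof (on)));
}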
2741
2742 /*
2743 * Set up IP options in pcb for insertion in output packets.
2744 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2745 * with destination address if source routed.
2746 */
2747 static int
2748 ip_pcbopts(__unused int optname, struct mbuf **pcbopt, struct mbuf *m)
2752 {
2753 int cnt, optlen;
2754 u_char *cp;
2755 u_char opt;
2756
2757 /* turn off any old options */
2758 if (*pcbopt)
2759 (void)m_free(*pcbopt);
2760 *pcbopt = 0;
2761 if (m == (struct mbuf *)0 || m->m_len == 0) {
2762 /*
2763 * Only turning off any previous options.
2764 */
2765 if (m)
2766 (void)m_free(m);
2767 return (0);
2768 }
2769
2770 #ifndef vax
2771 if (m->m_len % sizeof(int32_t))
2772 goto bad;
2773 #endif
2774 /*
2775 * IP first-hop destination address will be stored before
2776 * actual options; move other options back
2777 * and clear it when none present.
2778 */
2779 if (m->m_data + m->m_len + sizeof(struct in_addr) >= &m->m_dat[MLEN])
2780 goto bad;
2781 cnt = m->m_len;
2782 m->m_len += sizeof(struct in_addr);
2783 cp = mtod(m, u_char *) + sizeof(struct in_addr);
2784 ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
2785 bzero(mtod(m, caddr_t), sizeof(struct in_addr));
2786
2787 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2788 opt = cp[IPOPT_OPTVAL];
2789 if (opt == IPOPT_EOL)
2790 break;
2791 if (opt == IPOPT_NOP)
2792 optlen = 1;
2793 else {
2794 if (cnt < IPOPT_OLEN + sizeof(*cp))
2795 goto bad;
2796 optlen = cp[IPOPT_OLEN];
2797 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt)
2798 goto bad;
2799 }
2800 switch (opt) {
2801
2802 default:
2803 break;
2804
2805 case IPOPT_LSRR:
2806 case IPOPT_SSRR:
2807 /*
2808 * user process specifies route as:
2809 * ->A->B->C->D
2810 * D must be our final destination (but we can't
2811 * check that since we may not have connected yet).
2812 * A is first hop destination, which doesn't appear in
2813 * actual IP option, but is stored before the options.
2814 */
2815 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr))
2816 goto bad;
2817 m->m_len -= sizeof(struct in_addr);
2818 cnt -= sizeof(struct in_addr);
2819 optlen -= sizeof(struct in_addr);
2820 cp[IPOPT_OLEN] = optlen;
2821 /*
2822 * Move first hop before start of options.
2823 */
2824 bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
2825 sizeof(struct in_addr));
2826 /*
2827 * Then copy rest of options back
2828 * to close up the deleted entry.
2829 */
2830 ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
2831 sizeof(struct in_addr)),
2832 (caddr_t)&cp[IPOPT_OFFSET+1],
2833 (unsigned)cnt + sizeof(struct in_addr));
2834 break;
2835 }
2836 }
2837 if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr))
2838 goto bad;
2839 *pcbopt = m;
2840 return (0);
2841
2842 bad:
2843 (void)m_free(m);
2844 return (EINVAL);
2845 }
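
/*
 * Editor's sketch (not kernel source): a buffer that ip_pcbopts() above will
 * accept via setsockopt(IP_OPTIONS).  A leading NOP pads the loose source
 * route to a multiple of 4 bytes (other sizes are rejected); the first
 * address in the LSRR list is the first hop that the routine moves in front
 * of the stored options.  hop1/hop2 are placeholders supplied by the caller.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/ip.h>                 /* IPOPT_NOP, IPOPT_LSRR, IPOPT_MINOFF */

static int
set_lsrr_sketch(int s, struct in_addr hop1, struct in_addr hop2)
{
        unsigned char opts[12];

        opts[0] = IPOPT_NOP;            /* pad to a 4-byte multiple */
        opts[1] = IPOPT_LSRR;           /* 0x83: copied, loose source route */
        opts[2] = 11;                   /* type + len + ptr + two addresses */
        opts[3] = IPOPT_MINOFF;         /* pointer to the first address */
        memcpy(&opts[4], &hop1, sizeof (hop1));
        memcpy(&opts[8], &hop2, sizeof (hop2));

        return (setsockopt(s, IPPROTO_IP, IP_OPTIONS, opts, sizeof (opts)));
}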
2846
2847 void
2848 ip_moptions_init(void)
2849 {
2850 PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug));
2851
2852 imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) :
2853 sizeof (struct ip_moptions_dbg);
2854
2855 imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0,
2856 IMO_ZONE_NAME);
2857 if (imo_zone == NULL) {
2858 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME);
2859 /* NOTREACHED */
2860 }
2861 zone_change(imo_zone, Z_EXPAND, TRUE);
2862 }
2863
2864 void
2865 imo_addref(struct ip_moptions *imo, int locked)
2866 {
2867 if (!locked)
2868 IMO_LOCK(imo);
2869 else
2870 IMO_LOCK_ASSERT_HELD(imo);
2871
2872 if (++imo->imo_refcnt == 0) {
2873 panic("%s: imo %p wraparound refcnt\n", __func__, imo);
2874 /* NOTREACHED */
2875 } else if (imo->imo_trace != NULL) {
2876 (*imo->imo_trace)(imo, TRUE);
2877 }
2878
2879 if (!locked)
2880 IMO_UNLOCK(imo);
2881 }
2882
2883 void
2884 imo_remref(struct ip_moptions *imo)
2885 {
2886 int i;
2887
2888 IMO_LOCK(imo);
2889 if (imo->imo_refcnt == 0) {
2890 panic("%s: imo %p negative refcnt", __func__, imo);
2891 /* NOTREACHED */
2892 } else if (imo->imo_trace != NULL) {
2893 (*imo->imo_trace)(imo, FALSE);
2894 }
2895
2896 --imo->imo_refcnt;
2897 if (imo->imo_refcnt > 0) {
2898 IMO_UNLOCK(imo);
2899 return;
2900 }
2901
2902 for (i = 0; i < imo->imo_num_memberships; ++i) {
2903 struct in_mfilter *imf;
2904
2905 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL;
2906 if (imf != NULL)
2907 imf_leave(imf);
2908
2909 (void) in_leavegroup(imo->imo_membership[i], imf);
2910
2911 if (imf != NULL)
2912 imf_purge(imf);
2913
2914 INM_REMREF(imo->imo_membership[i]);
2915 imo->imo_membership[i] = NULL;
2916 }
2917 imo->imo_num_memberships = 0;
2918 if (imo->imo_mfilters != NULL) {
2919 FREE(imo->imo_mfilters, M_INMFILTER);
2920 imo->imo_mfilters = NULL;
2921 }
2922 if (imo->imo_membership != NULL) {
2923 FREE(imo->imo_membership, M_IPMOPTS);
2924 imo->imo_membership = NULL;
2925 }
2926 IMO_UNLOCK(imo);
2927
2928 lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp);
2929
2930 if (!(imo->imo_debug & IFD_ALLOC)) {
2931 panic("%s: imo %p cannot be freed", __func__, imo);
2932 /* NOTREACHED */
2933 }
2934 zfree(imo_zone, imo);
2935 }
2936
2937 static void
2938 imo_trace(struct ip_moptions *imo, int refhold)
2939 {
2940 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2941 ctrace_t *tr;
2942 u_int32_t idx;
2943 u_int16_t *cnt;
2944
2945 if (!(imo->imo_debug & IFD_DEBUG)) {
2946 panic("%s: imo %p has no debug structure", __func__, imo);
2947 /* NOTREACHED */
2948 }
2949 if (refhold) {
2950 cnt = &imo_dbg->imo_refhold_cnt;
2951 tr = imo_dbg->imo_refhold;
2952 } else {
2953 cnt = &imo_dbg->imo_refrele_cnt;
2954 tr = imo_dbg->imo_refrele;
2955 }
2956
2957 idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE;
2958 ctrace_record(&tr[idx]);
2959 }
2960
2961 struct ip_moptions *
2962 ip_allocmoptions(int how)
2963 {
2964 struct ip_moptions *imo;
2965
2966 imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone);
2967 if (imo != NULL) {
2968 bzero(imo, imo_size);
2969 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr);
2970 imo->imo_debug |= IFD_ALLOC;
2971 if (imo_debug != 0) {
2972 imo->imo_debug |= IFD_DEBUG;
2973 imo->imo_trace = imo_trace;
2974 }
2975 IMO_ADDREF(imo);
2976 }
2977
2978 return (imo);
2979 }
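
/*
 * Editor's note (not kernel source): the usual lifecycle of the object
 * returned above is ip_allocmoptions() (which takes the initial reference),
 * IMO_ADDREF()/IMO_REMREF() for every additional holder, and a final
 * IMO_REMREF() that reaches imo_remref(), leaves the joined groups, frees
 * the filter and membership arrays, and returns the element to imo_zone.
 */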
2980
2981 /*
2982 * Routine called from ip_output() to loop back a copy of an IP multicast
2983 * packet to the input queue of a specified interface. Note that this
2984 * calls the output routine of the loopback "driver", but with an interface
2985 * pointer that might NOT be a loopback interface -- evil, but easier than
2986 * replicating that code here.
2987 */
2988 static void
2989 ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst, int hlen)
2994 {
2995 struct ip *ip;
2996 struct mbuf *copym;
2997 int sw_csum = (apple_hwcksum_tx == 0);
2998
2999 copym = m_copy(m, 0, M_COPYALL);
3000 if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < hlen))
3001 copym = m_pullup(copym, hlen);
3002
3003 if (copym == NULL)
3004 return;
3005
3006 /*
3007 * We don't bother to fragment if the IP length is greater
3008 * than the interface's MTU. Can this possibly matter?
3009 */
3010 ip = mtod(copym, struct ip *);
3011
3012 #if BYTE_ORDER != BIG_ENDIAN
3013 HTONS(ip->ip_len);
3014 HTONS(ip->ip_off);
3015 #endif
3016
3017 ip->ip_sum = 0;
3018 ip->ip_sum = in_cksum(copym, hlen);
3019 /*
3020 * NB:
3021 * It's not clear whether there are any lingering
3022 * reentrancy problems in other areas which might
3023 * be exposed by using ip_input directly (in
3024 * particular, everything which modifies the packet
3025 * in-place). Yet another option is using the
3026 * protosw directly to deliver the looped back
3027 * packet. For the moment, we'll err on the side
3028 * of safety by using if_simloop().
3029 */
3030 #if 1 /* XXX */
3031 if (dst->sin_family != AF_INET) {
3032 printf("ip_mloopback: bad address family %d\n",
3033 dst->sin_family);
3034 dst->sin_family = AF_INET;
3035 }
3036 #endif
3037
3038 /*
3039 * Mark checksum as valid or calculate checksum for loopback.
3040 *
3041 * This is done this way because we have to embed the ifp of
3042 * the interface we will send the original copy of the packet
3043 * out on in the mbuf. ip_input will check if_hwassist of the
3044 * embedded ifp and ignore all csum_flags if if_hwassist is 0.
3045 * The UDP checksum has not been calculated yet.
3046 */
3047 if (sw_csum || (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) {
3048 if (!sw_csum && IF_HWASSIST_CSUM_FLAGS(ifp->if_hwassist)) {
3049 copym->m_pkthdr.csum_flags |=
3050 CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
3051 CSUM_IP_CHECKED | CSUM_IP_VALID;
3052 copym->m_pkthdr.csum_data = 0xffff;
3053 } else {
3054
3055 #if BYTE_ORDER != BIG_ENDIAN
3056 NTOHS(ip->ip_len);
3057 #endif
3058
3059 in_delayed_cksum(copym);
3060
3061 #if BYTE_ORDER != BIG_ENDIAN
3062 HTONS(ip->ip_len);
3063 #endif
3064
3065 }
3066 }
3067
3068 /*
3069 * TedW:
3070 * We need to send all loopback traffic down to dlil in case
3071 * a filter has tapped-in.
3072 */
3073
3074 /*
3075 * Stuff the 'real' ifp into the pkthdr, to be used in matching
3076 * in ip_input(); we need the loopback ifp/dl_tag passed as args
3077 * to make the loopback driver compliant with the data link
3078 * requirements.
3079 */
3080 if (lo_ifp) {
3081 copym->m_pkthdr.rcvif = ifp;
3082 dlil_output(lo_ifp, PF_INET, copym, 0,
3083 (struct sockaddr *) dst, 0, NULL);
3084 } else {
3085 printf("Warning: ip_output call to dlil_find_dltag failed!\n");
3086 m_freem(copym);
3087 }
3088 }
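
/*
 * Editor's sketch (not kernel source): the per-socket options that decide
 * whether ip_mloopback() is reached for a sender's own multicast traffic.
 * Clearing IP_MULTICAST_LOOP suppresses the looped-back copy; the TTL value
 * shown is only an example.
 */
#include <sys/socket.h>
#include <netinet/in.h>

static int
tune_mcast_tx_sketch(int s)
{
        unsigned char loop = 0;         /* do not loop our own datagrams back */
        unsigned char ttl = 8;          /* example scope */

        if (setsockopt(s, IPPROTO_IP, IP_MULTICAST_LOOP, &loop, sizeof (loop)) != 0)
                return (-1);
        return (setsockopt(s, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof (ttl)));
}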
3089
3090 /*
3091 * Given a source IP address (and route, if available), determine the best
3092 * interface to send the packet from. Checking for (and updating) the
3093 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3094 * without any locks based on the assumption that ip_output() is single-
3095 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3096 * performing output at the IP layer.
3097 *
3098 * This routine is analogous to in6_selectroute() for IPv6.
3099 */
3100 static struct ifaddr *
3101 in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3102 {
3103 struct ifaddr *ifa = NULL;
3104 struct in_addr src = ip->ip_src;
3105 struct in_addr dst = ip->ip_dst;
3106 struct ifnet *rt_ifp;
3107 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3108
3109 if (ip_select_srcif_debug) {
3110 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src));
3111 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst));
3112 }
3113
3114 if (ro->ro_rt != NULL)
3115 RT_LOCK(ro->ro_rt);
3116
3117 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3118
3119 /*
3120 * Given the source IP address, find a suitable source interface
3121 * to use for transmission; if the caller has specified a scope,
3122 * optimize the search by looking at the addresses only for that
3123 * interface. This is still suboptimal, however, as we need to
3124 * traverse the per-interface list.
3125 */
3126 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3127 unsigned int scope = ifscope;
3128
3129 /*
3130 * If no scope is specified and the route is stale (pointing
3131 * to a defunct interface) use the current primary interface;
3132 * this happens when switching between interfaces configured
3133 * with the same IP address. Otherwise pick up the scope
3134 * information from the route; the ULP may have looked up a
3135 * correct route and we just need to verify it here and mark
3136 * it with the ROF_SRCIF_SELECTED flag below.
3137 */
3138 if (scope == IFSCOPE_NONE) {
3139 scope = rt_ifp->if_index;
3140 if (scope != get_primary_ifscope(AF_INET) &&
3141 ro->ro_rt->generation_id != route_generation)
3142 scope = get_primary_ifscope(AF_INET);
3143 }
3144
3145 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3146
3147 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3148 ip->ip_p != IPPROTO_TCP && ipforwarding) {
3149 /*
3150 * If forwarding is enabled, and if the packet isn't
3151 * TCP or UDP, check if the source address belongs
3152 * to one of our own interfaces; if so, demote the
3153 * interface scope and do a route lookup right below.
3154 */
3155 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3156 if (ifa != NULL) {
3157 IFA_REMREF(ifa);
3158 ifa = NULL;
3159 ifscope = IFSCOPE_NONE;
3160 }
3161 }
3162
3163 if (ip_select_srcif_debug && ifa != NULL) {
3164 if (ro->ro_rt != NULL) {
3165 printf("%s->%s ifscope %d->%d ifa_if %s "
3166 "ro_if %s\n", s_src, s_dst, ifscope,
3167 scope, if_name(ifa->ifa_ifp),
3168 if_name(rt_ifp));
3169 } else {
3170 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3171 s_src, s_dst, ifscope, scope,
3172 if_name(ifa->ifa_ifp));
3173 }
3174 }
3175 }
3176
3177 /*
3178 * Slow path; search for an interface having the corresponding source
3179 * IP address if the scope was not specified by the caller, and:
3180 *
3181 * 1) There currently isn't any route, or,
3182 * 2) The interface used by the route does not own that source
3183 * IP address; in this case, the route will get blown away
3184 * and we'll do a more specific scoped search using the newly
3185 * found interface.
3186 */
3187 if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3188 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3189
3190 /*
3191 * If we have the IP address, but not the route, we don't
3192 * really know whether or not it belongs to the correct
3193 * interface (it could be shared across multiple interfaces.)
3194 * The only way to find out is to do a route lookup.
3195 */
3196 if (ifa != NULL && ro->ro_rt == NULL) {
3197 struct rtentry *rt;
3198 struct sockaddr_in sin;
3199 struct ifaddr *oifa = NULL;
3200
3201 bzero(&sin, sizeof (sin));
3202 sin.sin_family = AF_INET;
3203 sin.sin_len = sizeof (sin);
3204 sin.sin_addr = dst;
3205
3206 lck_mtx_lock(rnh_lock);
3207 if ((rt = rt_lookup(TRUE, (struct sockaddr *)&sin, NULL,
3208 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3209 RT_LOCK(rt);
3210 /*
3211 * If the route uses a different interface,
3212 * use that one instead. The IP address of
3213 * the ifaddr that we pick up here is not
3214 * relevant.
3215 */
3216 if (ifa->ifa_ifp != rt->rt_ifp) {
3217 oifa = ifa;
3218 ifa = rt->rt_ifa;
3219 IFA_ADDREF(ifa);
3220 RT_UNLOCK(rt);
3221 } else {
3222 RT_UNLOCK(rt);
3223 }
3224 rtfree_locked(rt);
3225 }
3226 lck_mtx_unlock(rnh_lock);
3227
3228 if (oifa != NULL) {
3229 struct ifaddr *iifa;
3230
3231 /*
3232 * See if the interface pointed to by the
3233 * route is configured with the source IP
3234 * address of the packet.
3235 */
3236 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3237 src.s_addr, ifa->ifa_ifp->if_index);
3238
3239 if (iifa != NULL) {
3240 /*
3241 * Found it; drop the original one
3242 * as well as the route interface
3243 * address, and use this instead.
3244 */
3245 IFA_REMREF(oifa);
3246 IFA_REMREF(ifa);
3247 ifa = iifa;
3248 } else if (!ipforwarding ||
3249 (rt->rt_flags & RTF_GATEWAY)) {
3250 /*
3251 * This interface doesn't have that
3252 * source IP address; drop the route
3253 * interface address and just use the
3254 * original one, and let the caller
3255 * do a scoped route lookup.
3256 */
3257 IFA_REMREF(ifa);
3258 ifa = oifa;
3259 } else {
3260 /*
3261 * Forwarding is enabled and the source
3262 * address belongs to one of our own
3263 * interfaces which isn't the outgoing
3264 * interface, and we have a route, and
3265 * the destination is on a network that
3266 * is directly attached (onlink); drop
3267 * the original one and use the route
3268 * interface address instead.
3269 */
3270 IFA_REMREF(oifa);
3271 }
3272 }
3273 } else if (ifa != NULL && ro->ro_rt != NULL &&
3274 !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3275 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3276 /*
3277 * Forwarding is enabled and the source address belongs
3278 * to one of our own interfaces which isn't the same
3279 * as the interface used by the known route; drop the
3280 * original one and use the route interface address.
3281 */
3282 IFA_REMREF(ifa);
3283 ifa = ro->ro_rt->rt_ifa;
3284 IFA_ADDREF(ifa);
3285 }
3286
3287 if (ip_select_srcif_debug && ifa != NULL) {
3288 printf("%s->%s ifscope %d ifa_if %s\n",
3289 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3290 }
3291 }
3292
3293 if (ro->ro_rt != NULL)
3294 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3295 /*
3296 * If there is a non-loopback route with the wrong interface, or if
3297 * there is no interface configured with such an address, blow it
3298 * away. Except for local/loopback, we look for one with a matching
3299 * interface scope/index.
3300 */
3301 if (ro->ro_rt != NULL &&
3302 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3303 !(ro->ro_rt->rt_flags & RTF_UP))) {
3304 if (ip_select_srcif_debug) {
3305 if (ifa != NULL) {
3306 printf("%s->%s ifscope %d ro_if %s != "
3307 "ifa_if %s (cached route cleared)\n",
3308 s_src, s_dst, ifscope, if_name(rt_ifp),
3309 if_name(ifa->ifa_ifp));
3310 } else {
3311 printf("%s->%s ifscope %d ro_if %s "
3312 "(no ifa_if found)\n",
3313 s_src, s_dst, ifscope, if_name(rt_ifp));
3314 }
3315 }
3316
3317 RT_UNLOCK(ro->ro_rt);
3318 rtfree(ro->ro_rt);
3319 ro->ro_rt = NULL;
3320 ro->ro_flags &= ~ROF_SRCIF_SELECTED;
3321
3322 /*
3323 * If the destination is IPv4 LLA and the route's interface
3324 * doesn't match the source interface, then the source IP
3325 * address is wrong; it most likely belongs to the primary
3326 * interface associated with the IPv4 LL subnet. Drop the
3327 * packet rather than letting it go out and return an error
3328 * to the ULP. This actually applies not only to IPv4 LL
3329 * but other shared subnets; for now we explicitly test only
3330 * for the former case and save the latter for future.
3331 */
3332 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3333 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3334 IFA_REMREF(ifa);
3335 ifa = NULL;
3336 }
3337 }
3338
3339 if (ip_select_srcif_debug && ifa == NULL) {
3340 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3341 s_src, s_dst, ifscope);
3342 }
3343
3344 /*
3345 * If there is a route, mark it accordingly. If there isn't one,
3346 * we'll get here again during the next transmit (possibly with a
3347 * route) and the flag will get set at that point. For IPv4 LLA
3348 * destination, mark it only if the route has been fully resolved;
3349 * otherwise we want to come back here again when the route points
3350 * to the interface over which the ARP reply arrives on.
3351 */
3352 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3353 (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3354 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3355 ro->ro_flags |= ROF_SRCIF_SELECTED;
3356 ro->ro_rt->generation_id = route_generation;
3357 }
3358
3359 if (ro->ro_rt != NULL)
3360 RT_UNLOCK(ro->ro_rt);
3361
3362 return (ifa);
3363 }
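
/*
 * Editor's summary (not kernel source) of the in_selectsrcif() flow above:
 *
 *   1. With a scope or a cached route, look for the source address on that
 *      interface only (ifa_foraddr_scoped).
 *   2. Otherwise search all interfaces (ifa_foraddr) and, where the address
 *      may be shared, consult the routing table to pick the right one.
 *   3. Throw away a cached route whose interface disagrees with the chosen
 *      interface, and refuse a mismatched source for link-local destinations.
 *   4. Mark the surviving route with ROF_SRCIF_SELECTED so later transmits
 *      on the same pcb can skip the search.
 */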