]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/in_pcb.c
485e8dbcd0c3f691b21ac565358a35031db0708d
[apple/xnu.git] / bsd / netinet / in_pcb.c
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1991, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62 */
63
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/domain.h>
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/proc.h>
73 #ifndef __APPLE__
74 #include <sys/jail.h>
75 #endif
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/mcache.h>
79 #include <sys/kauth.h>
80 #include <sys/priv.h>
81 #include <libkern/OSAtomic.h>
82 #include <kern/locks.h>
83
84 #include <machine/limits.h>
85
86 #ifdef __APPLE__
87 #include <kern/zalloc.h>
88 #endif
89
90 #include <net/if.h>
91 #include <net/if_types.h>
92 #include <net/route.h>
93 #include <net/flowhash.h>
94 #include <net/flowadv.h>
95
96 #include <netinet/in.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_var.h>
99 #include <netinet/ip_var.h>
100 #if INET6
101 #include <netinet/ip6.h>
102 #include <netinet6/ip6_var.h>
103 #endif /* INET6 */
104
105 #if IPSEC
106 #include <netinet6/ipsec.h>
107 #include <netkey/key.h>
108 #endif /* IPSEC */
109
110 #include <sys/kdebug.h>
111 #include <sys/random.h>
112 #include <dev/random/randomdev.h>
113
114 #if IPSEC
115 extern int ipsec_bypass;
116 #endif
117
118 #define DBG_FNC_PCB_LOOKUP NETDBG_CODE(DBG_NETTCP, (6 << 8))
119 #define DBG_FNC_PCB_HLOOKUP NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))
120
121 struct in_addr zeroin_addr;
122
123 /*
124 * These configure the range of local port addresses assigned to
125 * "unspecified" outgoing connections/packets/whatever.
126 */
127 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
128 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
129 #ifndef __APPLE__
130 int ipport_firstauto = IPPORT_RESERVED; /* 1024 */
131 int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */
132 #else
133 int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
134 int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */
135 #endif
136 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
137 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
138
139 #define RANGECHK(var, min, max) \
140 if ((var) < (min)) { (var) = (min); } \
141 else if ((var) > (max)) { (var) = (max); }
142
143 static int
144 sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
145 {
146 #pragma unused(arg1, arg2)
147 int error = sysctl_handle_int(oidp,
148 oidp->oid_arg1, oidp->oid_arg2, req);
149 if (!error) {
150 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
151 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
152 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
153 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
154 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
155 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
156 }
157 return error;
158 }
159
160 #undef RANGECHK
161
162 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IP Ports");
163
164 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
165 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
166 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
167 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
168 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
169 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
171 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
172 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
173 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
174 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
175 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
176
177 extern int udp_use_randomport;
178 extern int tcp_use_randomport;
179
180 /* Structs used for flowhash computation */
/*
 * Address component of the flowhash key; a union so that IPv4 and
 * IPv6 addresses occupy the same (16-byte) space and can be hashed
 * uniformly through the raw-byte views.
 */
struct inp_flowhash_key_addr {
	union {
		struct in_addr v4;	/* IPv4 address */
		struct in6_addr v6;	/* IPv6 address */
		u_int8_t addr8[16];	/* raw 8-bit view */
		u_int16_t addr16[8];	/* raw 16-bit view */
		u_int32_t addr32[4];	/* raw 32-bit view */
	} infha;
};

/*
 * Full flowhash input: local/foreign address and port, address
 * family and protocol, plus two random salts (see inp_hash_seed)
 * to make hash values unpredictable across boots.
 */
struct inp_flowhash_key {
	struct inp_flowhash_key_addr infh_laddr;	/* local address */
	struct inp_flowhash_key_addr infh_faddr;	/* foreign address */
	u_int32_t infh_lport;		/* local port */
	u_int32_t infh_fport;		/* foreign port */
	u_int32_t infh_af;		/* address family */
	u_int32_t infh_proto;		/* IP protocol */
	u_int32_t infh_rand1;		/* random salt #1 */
	u_int32_t infh_rand2;		/* random salt #2 */
};
201
202 u_int32_t inp_hash_seed = 0;
203
204 static __inline int infc_cmp(const struct inpcb *,
205 const struct inpcb *);
206 lck_grp_t *inp_lck_grp;
207 lck_grp_attr_t *inp_lck_grp_attr;
208 lck_attr_t *inp_lck_attr;
209 decl_lck_mtx_data(, inp_fc_lck);
210
211 RB_HEAD(inp_fc_tree, inpcb) inp_fc_tree;
212 RB_PROTOTYPE(inp_fc_tree, inpcb, infc_link, infc_cmp);
213 RB_GENERATE(inp_fc_tree, inpcb, infc_link, infc_cmp);
214
215 /*
216 * Use this inp as a key to find an inp in the flowhash tree.
217 * Accesses to it are protected by inp_fc_lck.
218 */
219 struct inpcb key_inp;
220
221 /*
222 * in_pcb.c: manage the Protocol Control Blocks.
223 */
224
225 /*
226 * Initialize data structures required to deliver
227 * flow advisories.
228 */
void
socket_flowadv_init(void)
{
	/* Set up the lock group/attributes used by the flow-control mutex. */
	inp_lck_grp_attr = lck_grp_attr_alloc_init();
	inp_lck_grp = lck_grp_alloc_init("inp_lck_grp", inp_lck_grp_attr);

	inp_lck_attr = lck_attr_alloc_init();
	lck_mtx_init(&inp_fc_lck, inp_lck_grp, inp_lck_attr);

	/*
	 * Initialize the flow-control tree and the lookup key under the
	 * lock, so that it is ready before any flow advisory is delivered.
	 */
	lck_mtx_lock(&inp_fc_lck);
	RB_INIT(&inp_fc_tree);
	bzero(&key_inp, sizeof(key_inp));
	lck_mtx_unlock(&inp_fc_lck);
}
243
244 /*
245 * Allocate a PCB and associate it with the socket.
246 *
247 * Returns: 0 Success
248 * ENOBUFS
249 * ENOMEM
250 * ipsec_init_policy:??? [IPSEC]
251 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc *p)
{
	struct inpcb *inp;
	caddr_t temp;
#if IPSEC
#ifndef __APPLE__
	int error;
#endif
#endif
#if CONFIG_MACF_NET
	int mac_error;
#endif

	if (so->cached_in_sock_layer == 0) {
		/* Fresh socket: carve a new PCB out of the zone allocator. */
#if TEMPDEBUG
		printf("PCBALLOC calling zalloc for socket %x\n", so);
#endif
		inp = (struct inpcb *) zalloc(pcbinfo->ipi_zone);
		if (inp == NULL)
			return (ENOBUFS);
		bzero((caddr_t)inp, sizeof(*inp));
	}
	else {
		/*
		 * Cached socket: reuse the PCB stashed in so_saved_pcb by
		 * in_pcbdispose(); preserve the saved protocol PCB pointer
		 * across the bzero().
		 */
#if TEMPDEBUG
		printf("PCBALLOC reusing PCB for socket %x\n", so);
#endif
		inp = (struct inpcb *)(void *)so->so_saved_pcb;
		temp = inp->inp_saved_ppcb;
		bzero((caddr_t) inp, sizeof(*inp));
		inp->inp_saved_ppcb = temp;
	}

	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
#if CONFIG_MACF_NET
	/* Label for MAC; on failure undo only what this function allocated. */
	mac_error = mac_inpcb_label_init(inp, M_WAITOK);
	if (mac_error != 0) {
		if (so->cached_in_sock_layer == 0)
			zfree(pcbinfo->ipi_zone, inp);
		return (mac_error);
	}
	mac_inpcb_label_associate(so, inp);
#endif
	// make sure inp_stat is always 64bit aligned
	inp->inp_stat = (struct inp_stat*)P2ROUNDUP(inp->inp_stat_store, sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store)
		+ sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) {
		panic("insufficient space to align inp_stat");
	}

	so->so_pcb = (caddr_t)inp;

	/* Protocols with per-PCB locking get their own mutex. */
	if (so->so_proto->pr_flags & PR_PCBLOCK) {
		lck_mtx_init(&inp->inpcb_mtx, pcbinfo->mtx_grp, pcbinfo->mtx_attr);
	}

#if IPSEC
#ifndef __APPLE__
	if (ipsec_bypass == 0) {
		error = ipsec_init_policy(so, &inp->inp_sp);
		if (error != 0) {
			zfree(pcbinfo->ipi_zone, inp);
			return error;
		}
	}
#endif
#endif /*IPSEC*/
#if INET6
	/* Honor the global "no IPv4-mapped addresses" policy for v6 sockets. */
	if (INP_SOCKAF(so) == AF_INET6 && !ip6_mapped_addr_on)
		inp->inp_flags |= IN6P_IPV6_V6ONLY;
#endif

#if INET6
	if (ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	/* Insert into the global PCB list under the info lock. */
	lck_rw_lock_exclusive(pcbinfo->mtx);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	lck_rw_done(pcbinfo->mtx);
	return (0);
}
337
338
/*
 * in_pcblookup_local_and_cleanup does everything
 * in_pcblookup_local does, but it also checks for a socket
 * that's going away.  Since we know that the pcbinfo lock is
 * held exclusively (read+write) when this function is called, we
 * can safely dispose of such a socket here, the way the slow
 * timer would normally do, and return NULL.  This is
 * useful for bind.
 */
struct inpcb*
in_pcblookup_local_and_cleanup(
	struct inpcbinfo *pcbinfo,
	struct in_addr laddr,
	u_int lport_arg,
	int wild_okay)
{
	struct inpcb *inp;

	/* Perform normal lookup */
	inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);

	/* Check if we found a match but it's waiting to be disposed */
	if (inp && inp->inp_wantcnt == WNT_STOPUSING) {
		struct socket *so = inp->inp_socket;

		lck_mtx_lock(&inp->inpcb_mtx);

		if (so->so_usecount == 0) {
			/*
			 * No one else holds the socket: detach/dispose the
			 * PCB now (safe because the caller holds the pcbinfo
			 * lock exclusively) and report no match.  Note that
			 * in_pcbdispose() releases inpcb_mtx.
			 */
			if (inp->inp_state != INPCB_STATE_DEAD)
				in_pcbdetach(inp);
			in_pcbdispose(inp);
			inp = NULL;
		}
		else {
			/* Still in use; leave it to the slow timer. */
			lck_mtx_unlock(&inp->inpcb_mtx);
		}
	}

	return inp;
}
379
380 #ifdef __APPLE_API_PRIVATE
381 static void
382 in_pcb_conflict_post_msg(u_int16_t port)
383 {
384 /*
385 * Radar 5523020 send a kernel event notification if a non-participating socket tries to bind
386 * the port a socket who has set SOF_NOTIFYCONFLICT owns.
387 */
388 struct kev_msg ev_msg;
389 struct kev_in_portinuse in_portinuse;
390
391 bzero(&in_portinuse, sizeof(struct kev_in_portinuse));
392 bzero(&ev_msg, sizeof(struct kev_msg));
393 in_portinuse.port = ntohs(port); /* port in host order */
394 in_portinuse.req_pid = proc_selfpid();
395 ev_msg.vendor_code = KEV_VENDOR_APPLE;
396 ev_msg.kev_class = KEV_NETWORK_CLASS;
397 ev_msg.kev_subclass = KEV_INET_SUBCLASS;
398 ev_msg.event_code = KEV_INET_PORTINUSE;
399 ev_msg.dv[0].data_ptr = &in_portinuse;
400 ev_msg.dv[0].data_length = sizeof(struct kev_in_portinuse);
401 ev_msg.dv[1].data_length = 0;
402 kev_post_msg(&ev_msg);
403 }
404 #endif
405 /*
406 * Returns: 0 Success
407 * EADDRNOTAVAIL Address not available.
408 * EINVAL Invalid argument
409 * EAFNOSUPPORT Address family not supported [notdef]
410 * EACCES Permission denied
411 * EADDRINUSE Address in use
412 * EAGAIN Resource unavailable, try again
413 * priv_check_cred:EPERM Operation not permitted
414 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
{
	struct socket *so = inp->inp_socket;
	unsigned short *lastport;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	u_short lport = 0, rand_port = 0;
	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error, randomport, conflict = 0;
	kauth_cred_t cred;

	if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
		return (EADDRNOTAVAIL);
	/* Already bound?  Binding twice is not allowed. */
	if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
		wild = 1;
	/*
	 * Drop the socket lock before taking the pcbinfo lock to respect
	 * lock ordering; the socket reference keeps it alive.  Every early
	 * return below must re-take the socket lock and drop pcbinfo->mtx.
	 */
	socket_unlock(so, 0); /* keep reference on socket */
	lck_rw_lock_exclusive(pcbinfo->mtx);
	if (nam) {
		struct ifnet *outif = NULL;

		sin = (struct sockaddr_in *)(void *)nam;
		if (nam->sa_len != sizeof (*sin)) {
			lck_rw_done(pcbinfo->mtx);
			socket_lock(so, 0);
			return (EINVAL);
		}
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET) {
			lck_rw_done(pcbinfo->mtx);
			socket_lock(so, 0);
			return (EAFNOSUPPORT);
		}
#endif
		lport = sin->sin_port;
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if (so->so_options & SO_REUSEADDR)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			/* A specific local address must belong to some
			 * local interface; remember that interface. */
			struct ifaddr *ifa;
			sin->sin_port = 0; /* yech... */
			if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin)) == 0) {
				lck_rw_done(pcbinfo->mtx);
				socket_lock(so, 0);
				return (EADDRNOTAVAIL);
			}
			else {
				IFA_LOCK(ifa);
				outif = ifa->ifa_ifp;
				IFA_UNLOCK(ifa);
				IFA_REMREF(ifa);
			}
		}
		if (lport) {
			struct inpcb *t;

			/* GROSS */
#if !CONFIG_EMBEDDED
			/* Binding below IPPORT_RESERVED requires privilege. */
			if (ntohs(lport) < IPPORT_RESERVED) {
				cred = kauth_cred_proc_ref(p);
				error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
				kauth_cred_unref(&cred);
				if (error != 0) {
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					return (EACCES);
				}
			}
#endif
			/*
			 * For non-root, non-multicast binds: refuse to take a
			 * port already bound by a different uid, unless the
			 * owner opted in via SOF_REUSESHAREUID or the existing
			 * binding allows SO_REUSEPORT with wildcard addresses.
			 */
			if (kauth_cred_getuid(so->so_cred) &&
			    !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
				t = in_pcblookup_local_and_cleanup(inp->inp_pcbinfo,
				    sin->sin_addr, lport, INPLOOKUP_WILDCARD);
				if (t &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				    (t->inp_socket->so_options &
				    SO_REUSEPORT) == 0) &&
				    (kauth_cred_getuid(so->so_cred) !=
				    kauth_cred_getuid(t->inp_socket->so_cred)) &&
				    ((t->inp_socket->so_flags & SOF_REUSESHAREUID) == 0) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) != INADDR_ANY))
				{
#ifdef __APPLE_API_PRIVATE
					/* Notify the port owner of the conflict
					 * if it asked for that (radar 5523020). */
					if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
						conflict = 1;

					lck_rw_done(pcbinfo->mtx);

					if (conflict)
						in_pcb_conflict_post_msg(lport);
#else
					lck_rw_done(pcbinfo->mtx);
#endif /* __APPLE_API_PRIVATE */

					socket_lock(so, 0);
					return (EADDRINUSE);
				}
			}
			/* General duplicate-binding check, honoring reuseport. */
			t = in_pcblookup_local_and_cleanup(pcbinfo, sin->sin_addr,
			    lport, wild);
			if (t &&
			    (reuseport & t->inp_socket->so_options) == 0) {
#if INET6
				/* An exact v6/wildcard pair is allowed to coexist. */
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    INP_SOCKAF(so) != AF_INET6 ||
				    INP_SOCKAF(t->inp_socket) != AF_INET6)
#endif /* INET6 */
				{
#ifdef __APPLE_API_PRIVATE

					if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
						conflict = 1;

					lck_rw_done(pcbinfo->mtx);

					if (conflict)
						in_pcb_conflict_post_msg(lport);
#else
					lck_rw_done(pcbinfo->mtx);
#endif /* __APPLE_API_PRIVATE */
					socket_lock(so, 0);
					return (EADDRINUSE);
				}
			}
		}
		inp->inp_laddr = sin->sin_addr;
		inp->inp_last_outifp = outif;
	}
	if (lport == 0) {
		/* No port requested: pick an ephemeral one. */
		u_short first, last;
		int count;

		randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
		    (so->so_type == SOCK_STREAM ? tcp_use_randomport : udp_use_randomport);

		inp->inp_flags |= INP_ANONPORT;

		/* Select the range and the per-range rover pointer. */
		if (inp->inp_flags & INP_HIGHPORT) {
			first = ipport_hifirstauto;	/* sysctl */
			last  = ipport_hilastauto;
			lastport = &pcbinfo->lasthi;
		} else if (inp->inp_flags & INP_LOWPORT) {
			/* Low (reserved) ports require privilege. */
			cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				lck_rw_done(pcbinfo->mtx);
				socket_lock(so, 0);
				return error;
			}
			first = ipport_lowfirstauto;	/* 1023 */
			last  = ipport_lowlastauto;	/* 600 */
			lastport = &pcbinfo->lastlow;
		} else {
			first = ipport_firstauto;	/* sysctl */
			last  = ipport_lastauto;
			lastport = &pcbinfo->lastport;
		}
		/* No point in randomizing if only one port is available */

		if (first == last)
			randomport = 0;
		/*
		 * Simple check to ensure all ports are not used up causing
		 * a deadlock here.
		 *
		 * We split the two cases (up and down) so that the direction
		 * is not being tested on each round of the loop.
		 */
		if (first > last) {
			/*
			 * counting down
			 */
			if (randomport) {
				read_random(&rand_port, sizeof(rand_port));
				*lastport = first - (rand_port % (first - last));
			}
			count = first - last;

			do {
				if (count-- < 0) {	/* completely used? */
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					inp->inp_laddr.s_addr = INADDR_ANY;
					inp->inp_last_outifp = NULL;
					return (EADDRNOTAVAIL);
				}
				--*lastport;
				/* Wrap the rover back into [last, first]. */
				if (*lastport > first || *lastport < last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local_and_cleanup(pcbinfo,
			    inp->inp_laddr, lport, wild));
		} else {
			/*
			 * counting up
			 */
			if (randomport) {
				read_random(&rand_port, sizeof(rand_port));
				*lastport = first + (rand_port % (first - last));
			}
			count = last - first;

			do {
				if (count-- < 0) {	/* completely used? */
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					inp->inp_laddr.s_addr = INADDR_ANY;
					inp->inp_last_outifp = NULL;
					return (EADDRNOTAVAIL);
				}
				++*lastport;
				/* Wrap the rover back into [first, last]. */
				if (*lastport < first || *lastport > last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local_and_cleanup(pcbinfo,
			    inp->inp_laddr, lport, wild));
		}
	}
	/* Commit the binding and insert into the hash under both locks. */
	socket_lock(so, 0);
	inp->inp_lport = lport;
	if (in_pcbinshash(inp, 1) != 0) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_last_outifp = NULL;
		lck_rw_done(pcbinfo->mtx);
		return (EAGAIN);
	}
	lck_rw_done(pcbinfo->mtx);
	sflt_notify(so, sock_evt_bound, NULL);
	return (0);
}
666
/*
 * Transform old in_pcbconnect() into an inner subroutine for new
 * in_pcbconnect(): Do some validity-checking on the remote
 * address (in 'nam') and then determine the local host address
 * (i.e., which interface) to use to access that remote host.
 *
 * This preserves the definition of in_pcbconnect(), while supporting a
 * slightly different version for T/TCP.  (This is more than
 * a bit of a kludge, but cleaning up the internal interfaces would
 * have forced minor changes in every protocol.)
 *
 * Returns:	0			Success
 *		EINVAL			Invalid argument
 *		EAFNOSUPPORT		Address family not supported
 *		EADDRNOTAVAIL		Address not available
 */
int
in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
	struct sockaddr_in *plocal_sin, struct ifnet **outif)
{
	struct in_ifaddr *ia;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;

	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	lck_rw_lock_shared(in_ifaddr_rwlock);
	if (!TAILQ_EMPTY(&in_ifaddrhead)) {
		ia = TAILQ_FIRST(&in_ifaddrhead);
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr = IA_SIN(ia)->sin_addr;
		else if (sin->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST &&
		    (ia->ia_ifp->if_flags & IFF_BROADCAST))
			sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
		IFA_UNLOCK(&ia->ia_ifa);
		ia = NULL;
	}
	lck_rw_done(in_ifaddr_rwlock);

	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		struct route *ro;
		unsigned int ifscope = IFSCOPE_NONE;
		unsigned int nocell;
		/*
		 * If the socket is bound to a specific interface, the
		 * optional scope passed in by the caller takes precedence
		 * over the bound interface.
		 */
		ia = (struct in_ifaddr *)0;

		if (outif != NULL && *outif != NULL)
			ifscope = (*outif)->if_index;
		else if (inp->inp_flags & INP_BOUND_IF)
			ifscope = inp->inp_boundifp->if_index;

		/* Remember whether cellular interfaces are forbidden. */
		nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
		/*
		 * If route is known or can be allocated now,
		 * our src addr is taken from the i/f, else punt.
		 * Note that we should check the address family of the cached
		 * destination, in case of sharing the cache with IPv6.
		 */
		ro = &inp->inp_route;
		if (ro->ro_rt != NULL)
			RT_LOCK_SPIN(ro->ro_rt);
		/* Invalidate a cached route that no longer matches. */
		if (ro->ro_rt && (ro->ro_dst.sa_family != AF_INET ||
		    satosin(&ro->ro_dst)->sin_addr.s_addr !=
		    sin->sin_addr.s_addr ||
		    inp->inp_socket->so_options & SO_DONTROUTE ||
		    ro->ro_rt->generation_id != route_generation)) {
			RT_UNLOCK(ro->ro_rt);
			rtfree(ro->ro_rt);
			ro->ro_rt = NULL;
		}
		if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
		    (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
			if (ro->ro_rt != NULL)
				RT_UNLOCK(ro->ro_rt);
			/* No route yet, so try to acquire one */
			bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
			((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr =
				sin->sin_addr;
			rtalloc_scoped(ro, ifscope);
			if (ro->ro_rt != NULL)
				RT_LOCK_SPIN(ro->ro_rt);
		}
		/*
		 * If the route points to a cellular interface and the
		 * caller forbids our using interfaces of such type,
		 * pretend that there is no route.
		 */
		if (nocell && ro->ro_rt != NULL) {
			RT_LOCK_ASSERT_HELD(ro->ro_rt);
			if (ro->ro_rt->rt_ifp->if_type == IFT_CELLULAR) {
				RT_UNLOCK(ro->ro_rt);
				rtfree(ro->ro_rt);
				ro->ro_rt = NULL;
				soevent(inp->inp_socket,
				    (SO_FILT_HINT_LOCKED |
				    SO_FILT_HINT_IFDENIED));
			}
		}
		/*
		 * If we found a route, use the address
		 * corresponding to the outgoing interface
		 * unless it is the loopback (in case a route
		 * to our address on another net goes to loopback).
		 */
		if (ro->ro_rt != NULL) {
			/* Become a regular mutex */
			RT_CONVERT_LOCK(ro->ro_rt);
			if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
				ia = ifatoia(ro->ro_rt->rt_ifa);
				if (ia) {
					IFA_ADDREF(&ia->ia_ifa);
				}
			}
			RT_UNLOCK(ro->ro_rt);
		}
		if (ia == 0) {
			/*
			 * Routing gave us nothing usable: fall back to a
			 * direct interface lookup (temporarily clearing the
			 * port so the sockaddr compares as address-only),
			 * then to the primary address as a last resort.
			 */
			u_short fport = sin->sin_port;

			sin->sin_port = 0;
			ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
			if (ia == 0) {
				ia = ifatoia(ifa_ifwithnet_scoped(sintosa(sin),
				    ifscope));
			}
			sin->sin_port = fport;
			if (ia == 0) {
				lck_rw_lock_shared(in_ifaddr_rwlock);
				ia = TAILQ_FIRST(&in_ifaddrhead);
				if (ia)
					IFA_ADDREF(&ia->ia_ifa);
				lck_rw_done(in_ifaddr_rwlock);
			}
			/*
			 * If the source address belongs to a cellular interface
			 * and the socket forbids our using interfaces of such
			 * type, pretend that there is no source address.
			 */
			if (nocell && ia != NULL &&
			    ia->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) {
				IFA_REMREF(&ia->ia_ifa);
				ia = NULL;
				soevent(inp->inp_socket,
				    (SO_FILT_HINT_LOCKED |
				    SO_FILT_HINT_IFDENIED));
			}
			if (ia == 0)
				return (EADDRNOTAVAIL);
		}
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, use the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			IMO_LOCK(imo);
			if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
			    ia->ia_ifp != imo->imo_multicast_ifp)) {
				ifp = imo->imo_multicast_ifp;
				/* Swap the reference to the multicast ifp's address. */
				if (ia)
					IFA_REMREF(&ia->ia_ifa);
				lck_rw_lock_shared(in_ifaddr_rwlock);
				TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp)
						break;
				}
				if (ia)
					IFA_ADDREF(&ia->ia_ifa);
				lck_rw_done(in_ifaddr_rwlock);
				if (ia == 0) {
					IMO_UNLOCK(imo);
					return (EADDRNOTAVAIL);
				}
			}
			IMO_UNLOCK(imo);
		}
		/*
		 * Don't do pcblookup call here; return interface in plocal_sin
		 * and exit to caller, that will do the lookup.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		*plocal_sin = ia->ia_addr;
		if (outif != NULL)
			*outif = ia->ia_ifp;
		IFA_UNLOCK(&ia->ia_ifa);
		IFA_REMREF(&ia->ia_ifa);
	}
	return(0);
}
878
879 /*
880 * Outer subroutine:
881 * Connect from a socket to a specified address.
882 * Both address and port must be specified in argument sin.
883 * If don't have a local address for this socket yet,
884 * then pick one.
885 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
	struct ifnet **outif)
{
	struct sockaddr_in ifaddr;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
	struct inpcb *pcb;
	int error;

	/*
	 * Call inner routine, to assign local interface address.
	 */
	if ((error = in_pcbladdr(inp, nam, &ifaddr, outif)) != 0)
		return(error);

	/*
	 * Check for an existing PCB with the same 4-tuple; the socket
	 * must be unlocked while taking the pcbinfo hash lock inside
	 * in_pcblookup_hash().
	 */
	socket_unlock(inp->inp_socket, 0);
	pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
	    inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr.sin_addr,
	    inp->inp_lport, 0, NULL);
	socket_lock(inp->inp_socket, 0);

	/* Check if the socket is still in a valid state. When we unlock this
	 * embryonic socket, it can get aborted if another thread is closing
	 * the listener (radar 7947600).
	 */
	if ((inp->inp_socket->so_flags & SOF_ABORTED) != 0) {
		return ECONNREFUSED;
	}

	if (pcb != NULL) {
		/* 4-tuple already in use; drop the lookup reference. */
		in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0);
		return (EADDRINUSE);
	}
	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		if (inp->inp_lport == 0) {
			/* Not yet bound: pick an ephemeral local port. */
			error = in_pcbbind(inp, (struct sockaddr *)0, p);
			if (error)
				return (error);
		}
		if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
			/*lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
		}
		inp->inp_laddr = ifaddr.sin_addr;
		inp->inp_last_outifp = (outif != NULL) ? *outif : NULL;
		inp->inp_flags |= INP_INADDR_ANY;
	}
	else {
		if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
			/*lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
		}
	}
	/* Record the foreign endpoint and move the PCB to its new hash slot. */
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);
	return (0);
}
949
void
in_pcbdisconnect(struct inpcb *inp)
{
	/* Clear the foreign endpoint; the PCB reverts to "unconnected". */
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;

	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
		/*lock inversion issue, mostly with udp multicast packets */
		socket_unlock(inp->inp_socket, 0);
		lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
		socket_lock(inp->inp_socket, 0);
	}

	/* Re-hash under the wildcard foreign address. */
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);

	/* If the file descriptor is already gone, finish tearing down. */
	if (inp->inp_socket->so_state & SS_NOFDREF)
		in_pcbdetach(inp);
}
970
void
in_pcbdetach(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	if (so->so_pcb == 0) { /* we've been called twice */
		panic("in_pcbdetach: inp=%p so=%p proto=%d so_pcb is null!\n",
			inp, so, so->so_proto->pr_protocol);
	}

#if IPSEC
	if (ipsec_bypass == 0) {
		ipsec4_delete_pcbpolicy(inp);
	}
#endif /*IPSEC*/

	/* mark socket state as dead */
	if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING)
		panic("in_pcbdetach so=%p prot=%x couldn't set to STOPUSING\n", so, so->so_proto->pr_protocol);

#if TEMPDEBUG
	if (so->cached_in_sock_layer)
		printf("in_pcbdetach for cached socket %x flags=%x\n", so, so->so_flags);
	else
		printf("in_pcbdetach for allocated socket %x flags=%x\n", so, so->so_flags);
#endif
	if ((so->so_flags & SOF_PCBCLEARING) == 0) {
		struct rtentry *rt;
		struct ip_moptions *imo;

		/* Release per-PCB resources: IP options, cached route,
		 * and multicast options. */
		inp->inp_vflag = 0;
		if (inp->inp_options)
			(void)m_free(inp->inp_options);
		if ((rt = inp->inp_route.ro_rt) != NULL) {
			inp->inp_route.ro_rt = NULL;
			rtfree(rt);
		}
		imo = inp->inp_moptions;
		inp->inp_moptions = NULL;
		if (imo != NULL)
			IMO_REMREF(imo);
		sofreelastref(so, 0);
		inp->inp_state = INPCB_STATE_DEAD;
		so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
	}
}
1017
1018
void
in_pcbdispose(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

#if TEMPDEBUG
	if (inp->inp_state != INPCB_STATE_DEAD) {
		printf("in_pcbdispose: not dead yet? so=%p\n", so);
	}
#endif
	/* Disposing a PCB whose socket is still referenced is fatal. */
	if (so && so->so_usecount != 0)
		panic("%s: so %p so_usecount %d so_lockhistory %s\n",
			__func__, so, so->so_usecount,
			(so != NULL) ? solockhistory_nr(so) : "--");

	/* Caller must hold the pcbinfo lock exclusively. */
	lck_rw_assert(ipi->mtx, LCK_RW_ASSERT_EXCLUSIVE);

	inp->inp_gencnt = ++ipi->ipi_gencnt;
	/* access ipi in in_pcbremlists */
	in_pcbremlists(inp);

	if (so) {
		if (so->so_proto->pr_flags & PR_PCBLOCK) {
			sofreelastref(so, 0);
			/* Drain any data left in the socket buffers. */
			if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
#if TEMPDEBUG
				printf("in_pcbdispose sb not cleaned up so=%p rc_cci=%x snd_cc=%x\n",
					so, so->so_rcv.sb_cc, so->so_snd.sb_cc);
#endif
				sbrelease(&so->so_rcv);
				sbrelease(&so->so_snd);
			}
			if (so->so_head != NULL)
				panic("in_pcbdispose, so=%p head still exist\n", so);
			/* Unlock before destroying: the per-PCB mutex
			 * is held on entry for PR_PCBLOCK protocols. */
			lck_mtx_unlock(&inp->inpcb_mtx);
			lck_mtx_destroy(&inp->inpcb_mtx, ipi->mtx_grp);
		}
		so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
		/* Save the PCB for reuse by cached sockets (see in_pcballoc). */
		so->so_saved_pcb = (caddr_t) inp;
		so->so_pcb = 0;
		inp->inp_socket = 0;
#if CONFIG_MACF_NET
		mac_inpcb_label_destroy(inp);
#endif
		/*
		 * In case there is a route cached after a detach (possible
		 * in the tcp case), make sure that it is freed before
		 * we deallocate the structure.
		 */
		if (inp->inp_route.ro_rt != NULL) {
			rtfree(inp->inp_route.ro_rt);
			inp->inp_route.ro_rt = NULL;
		}
		if (so->cached_in_sock_layer == 0) {
			zfree(ipi->ipi_zone, inp);
		}
		sodealloc(so);
	}
#if TEMPDEBUG
	else
		printf("in_pcbdispose: no socket for inp=%p\n", inp);
#endif
}
1083
1084 /*
1085 * The calling convention of in_setsockaddr() and in_setpeeraddr() was
1086 * modified to match the pru_sockaddr() and pru_peeraddr() entry points
1087 * in struct pr_usrreqs, so that protocols can just reference then directly
1088 * without the need for a wrapper function. The socket must have a valid
1089 * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
1090 * except through a kernel programming error, so it is acceptable to panic
1091 * (or in this case trap) if the PCB is invalid. (Actually, we don't trap
1092 * because there actually /is/ a programming error somewhere... XXX)
1093 *
1094 * Returns: 0 Success
1095 * ENOBUFS No buffer space available
1096 * ECONNRESET Connection reset
1097 */
1098 int
1099 in_setsockaddr(struct socket *so, struct sockaddr **nam)
1100 {
1101 struct inpcb *inp;
1102 struct sockaddr_in *sin;
1103
1104 /*
1105 * Do the malloc first in case it blocks.
1106 */
1107 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
1108 if (sin == NULL)
1109 return ENOBUFS;
1110 bzero(sin, sizeof *sin);
1111 sin->sin_family = AF_INET;
1112 sin->sin_len = sizeof(*sin);
1113
1114 inp = sotoinpcb(so);
1115 if (!inp) {
1116 FREE(sin, M_SONAME);
1117 return ECONNRESET;
1118 }
1119 sin->sin_port = inp->inp_lport;
1120 sin->sin_addr = inp->inp_laddr;
1121
1122 *nam = (struct sockaddr *)sin;
1123 return 0;
1124 }
1125
1126 int
1127 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1128 {
1129 struct inpcb *inp;
1130 struct sockaddr_in *sin;
1131
1132 /*
1133 * Do the malloc first in case it blocks.
1134 */
1135 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
1136 if (sin == NULL)
1137 return ENOBUFS;
1138 bzero((caddr_t)sin, sizeof (*sin));
1139 sin->sin_family = AF_INET;
1140 sin->sin_len = sizeof(*sin);
1141
1142 inp = sotoinpcb(so);
1143 if (!inp) {
1144 FREE(sin, M_SONAME);
1145 return ECONNRESET;
1146 }
1147 sin->sin_port = inp->inp_fport;
1148 sin->sin_addr = inp->inp_faddr;
1149
1150 *nam = (struct sockaddr *)sin;
1151 return 0;
1152 }
1153
/*
 * Invoke the supplied notify callback, with the given errno, on every
 * IPv4 PCB in this pcbinfo whose foreign address matches faddr.
 * Used e.g. to propagate ICMP errors to all affected connections.
 *
 * Takes the pcbinfo lock shared for the walk; each matched PCB is
 * pinned with a want-count (WNT_ACQUIRE) before its socket lock is
 * taken, so it cannot be reclaimed mid-notification.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
		int errno, void (*notify)(struct inpcb *, int))
{
	struct inpcb *inp;

	lck_rw_lock_shared(pcbinfo->mtx);

	LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL)
			continue;
		/* skip PCBs already marked for recycling */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;
		socket_lock(inp->inp_socket, 1);
		(*notify)(inp, errno);
		(void)in_pcb_checkstate(inp, WNT_RELEASE, 1);
		socket_unlock(inp->inp_socket, 1);
	}
	lck_rw_done(pcbinfo->mtx);
}
1179
1180 /*
1181 * Check for alternatives when higher level complains
1182 * about service problems. For now, invalidate cached
1183 * routing information. If the route was created dynamically
1184 * (by a redirect), time to try a default gateway again.
1185 */
/*
 * Check for alternatives when higher level complains
 * about service problems. For now, invalidate cached
 * routing information. If the route was created dynamically
 * (by a redirect), time to try a default gateway again.
 */
void
in_losing(struct inpcb *inp)
{
	struct rtentry *rt;
	struct rt_addrinfo info;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia;

		bzero((caddr_t)&info, sizeof(info));
		RT_LOCK(rt);
		info.rti_info[RTAX_DST] =
			(struct sockaddr *)&inp->inp_route.ro_dst;
		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
		/* advertise the loss on the routing socket */
		rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
		if (rt->rt_flags & RTF_DYNAMIC) {
			/*
			 * Prevent another thread from modifying rt_key,
			 * rt_gateway via rt_setgate() after rt_lock is
			 * dropped by marking the route as defunct.
			 */
			rt->rt_flags |= RTF_CONDEMNED;
			RT_UNLOCK(rt);
			(void) rtrequest(RTM_DELETE, rt_key(rt),
				rt->rt_gateway, rt_mask(rt), rt->rt_flags,
				(struct rtentry **)0);
		} else {
			RT_UNLOCK(rt);
		}
		/* if the address is gone keep the old route in the pcb */
		if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
			/* local address still valid: drop the cached route */
			inp->inp_route.ro_rt = NULL;
			rtfree(rt);
			IFA_REMREF(&ia->ia_ifa);
		}
		/*
		 * A new route can be allocated
		 * the next time output is attempted.
		 */
	}
}
1228
1229 /*
1230 * After a routing change, flush old routing
1231 * and allocate a (hopefully) better one.
1232 */
/*
 * After a routing change, flush old routing
 * and allocate a (hopefully) better one.
 */
void
in_rtchange(struct inpcb *inp, __unused int errno)
{
	struct rtentry *rt;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia;

		if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) {
			return; /* we can't remove the route now. not sure if still ok to use src */
		}
		/* ifa_foraddr took a reference on the ifaddr; drop it */
		IFA_REMREF(&ia->ia_ifa);
		rtfree(rt);
		inp->inp_route.ro_rt = NULL;
		/*
		 * A new route can be allocated the next time
		 * output is attempted.
		 */
	}
}
1253
1254 /*
1255 * Lookup a PCB based on the local address and port.
1256 */
/*
 * Lookup a PCB based on the local address and port.
 *
 * Without wild_okay, only an exact { laddr, lport } match with a
 * wildcard foreign address is accepted (fast hash lookup).  With
 * wild_okay, a best-fit search over the port hash list is performed,
 * preferring the PCB with the fewest wildcard components (matchwild
 * counts wildcards; 0 is a perfect match).
 *
 * NOTE(review): caller is presumed to hold the pcbinfo lock — this
 * function takes no locks of its own; confirm against callers.
 */
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
	unsigned int lport_arg, int wild_okay)
{
	struct inpcb *inp;
	int matchwild = 3, wildcard;
	u_short lport = lport_arg;

	KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0,0,0,0,0);

	if (!wild_okay) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
#if INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found.
				 */
				return (inp);
			}
		}
		/*
		 * Not found.
		 */
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0,0,0,0,0);
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->porthashmask)];
		LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.
			 */
			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
#if INET6
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
#endif
				/* each wildcard component makes this a weaker match */
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0) {
						/* exact match; cannot do better */
						break;
					}
				}
			}
		}
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,0,0,0,0);
		return (match);
	}
}
1344
1345 /*
1346 * Check if PCB exists in hash list.
1347 */
/*
 * Check if PCB exists in hash list.
 *
 * Like in_pcblookup_hash(), but instead of returning the PCB it reports
 * existence and, when the matching PCB has a socket, fills in *uid/*gid
 * with the socket owner's credentials (UID_MAX/GID_MAX otherwise).
 * Used by the net.inet.*.pcblist-style existence/ownership queries.
 *
 * Match order: exact { faddr, fport, laddr, lport } first, then (if
 * wildcard) wildcard-foreign PCBs, preferring an exact local address
 * over INADDR_ANY, and a plain IPv4 wildcard over an IPv6 mapped one.
 *
 * Returns: 1 if a matching PCB with a socket exists, 0 otherwise.
 */
int
in_pcblookup_hash_exists(
	struct inpcbinfo *pcbinfo,
	struct in_addr faddr,
	u_int fport_arg,
	struct in_addr laddr,
	u_int lport_arg,
	int wildcard,
	uid_t *uid,
	gid_t *gid,
	struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = fport_arg, lport = lport_arg;
	int found;

	*uid = UID_MAX;
	*gid = GID_MAX;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(pcbinfo->mtx);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		/* honor interface-restricted receive, unless the PCB opted out */
		if (ip_restrictrecvif && ifp != NULL &&
		    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
		    !(inp->inp_flags & INP_RECV_ANYIF))
			continue;

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			if ((found = (inp->inp_socket != NULL))) {
				/*
				 * Found.
				 */
				*uid = kauth_cred_getuid(
				    inp->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
				    inp->inp_socket->so_cred);
			}
			lck_rw_done(pcbinfo->mtx);
			return (found);
		}
	}
	if (wildcard) {
		struct inpcb *local_wild = NULL;
#if INET6
		struct inpcb *local_wild_mapped = NULL;
#endif

		/* second pass: wildcard foreign address bucket */
		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
		    pcbinfo->hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
#if INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (ip_restrictrecvif && ifp != NULL &&
			    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
			    !(inp->inp_flags & INP_RECV_ANYIF))
				continue;

			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == lport) {
				if (inp->inp_laddr.s_addr == laddr.s_addr) {
					if ((found = (inp->inp_socket != NULL))) {
						*uid = kauth_cred_getuid(
						    inp->inp_socket->so_cred);
						*gid = kauth_cred_getgid(
						    inp->inp_socket->so_cred);
					}
					lck_rw_done(pcbinfo->mtx);
					return (found);
				}
				else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#if INET6
					/* remember mapped/plain wildcards separately */
					if (inp->inp_socket &&
					    INP_CHECK_SOCKAF(inp->inp_socket,
					    AF_INET6))
						local_wild_mapped = inp;
					else
#endif /* INET6 */
						local_wild = inp;
				}
			}
		}
		if (local_wild == NULL) {
#if INET6
			if (local_wild_mapped != NULL) {
				if ((found = (local_wild_mapped->inp_socket != NULL))) {
					*uid = kauth_cred_getuid(
					    local_wild_mapped->inp_socket->so_cred);
					*gid = kauth_cred_getgid(
					    local_wild_mapped->inp_socket->so_cred);
				}
				lck_rw_done(pcbinfo->mtx);
				return (found);
			}
#endif /* INET6 */
			lck_rw_done(pcbinfo->mtx);
			return (0);
		}
		if (local_wild != NULL) {
			if ((found = (local_wild->inp_socket != NULL))) {
				*uid = kauth_cred_getuid(
				    local_wild->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
				    local_wild->inp_socket->so_cred);
			}
			lck_rw_done(pcbinfo->mtx);
			return (found);
		}
	}

	/*
	 * Not found.
	 */
	lck_rw_done(pcbinfo->mtx);
	return (0);
}
1482
1483 /*
1484 * Lookup PCB in hash list.
1485 */
/*
 * Lookup PCB in hash list.
 *
 * Searches for an exact { faddr, fport, laddr, lport } match first;
 * with wildcard set, falls back to wildcard-foreign PCBs, preferring
 * an exact local address, then a plain IPv4 wildcard, then an IPv6
 * mapped wildcard.  The returned PCB has its want-count bumped via
 * in_pcb_checkstate(WNT_ACQUIRE); a PCB found in STOPUSING state is
 * treated as not found (NULL).  Takes the pcbinfo lock shared.
 */
struct inpcb *
in_pcblookup_hash(
	struct inpcbinfo *pcbinfo,
	struct in_addr faddr,
	u_int fport_arg,
	struct in_addr laddr,
	u_int lport_arg,
	int wildcard,
	struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = fport_arg, lport = lport_arg;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(pcbinfo->mtx);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		/* honor interface-restricted receive, unless the PCB opted out */
		if (ip_restrictrecvif && ifp != NULL &&
		    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
		    !(inp->inp_flags & INP_RECV_ANYIF))
			continue;

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * Found.
			 */
			if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				lck_rw_done(pcbinfo->mtx);
				return (inp);
			}
			else {	/* it's there but dead, say it isn't found */
				lck_rw_done(pcbinfo->mtx);
				return (NULL);
			}
		}
	}
	if (wildcard) {
		struct inpcb *local_wild = NULL;
#if INET6
		struct inpcb *local_wild_mapped = NULL;
#endif

		/* second pass: wildcard foreign address bucket */
		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
#if INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (ip_restrictrecvif && ifp != NULL &&
			    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
			    !(inp->inp_flags & INP_RECV_ANYIF))
				continue;

			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == lport) {
				if (inp->inp_laddr.s_addr == laddr.s_addr) {
					if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
						lck_rw_done(pcbinfo->mtx);
						return (inp);
					}
					else {	/* it's there but dead, say it isn't found */
						lck_rw_done(pcbinfo->mtx);
						return (NULL);
					}
				}
				else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#if INET6
					/* remember mapped/plain wildcards separately */
					if (INP_CHECK_SOCKAF(inp->inp_socket,
					    AF_INET6))
						local_wild_mapped = inp;
					else
#endif /* INET6 */
						local_wild = inp;
				}
			}
		}
		if (local_wild == NULL) {
#if INET6
			if (local_wild_mapped != NULL) {
				if (in_pcb_checkstate(local_wild_mapped, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
					lck_rw_done(pcbinfo->mtx);
					return (local_wild_mapped);
				}
				else {	/* it's there but dead, say it isn't found */
					lck_rw_done(pcbinfo->mtx);
					return (NULL);
				}
			}
#endif /* INET6 */
			lck_rw_done(pcbinfo->mtx);
			return (NULL);
		}
		if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
			lck_rw_done(pcbinfo->mtx);
			return (local_wild);
		}
		else {	/* it's there but dead, say it isn't found */
			lck_rw_done(pcbinfo->mtx);
			return (NULL);
		}
	}

	/*
	 * Not found.
	 */
	lck_rw_done(pcbinfo->mtx);
	return (NULL);
}
1609
1610 /*
1611 * Insert PCB onto various hash lists.
1612 */
/*
 * Insert PCB onto various hash lists.
 *
 * Adds the PCB to the { faddr, fport, laddr, lport } hash bucket and to
 * the per-lport list (allocating an inpcbport head for the port if one
 * does not exist yet).  If 'locked' is 0 the pcbinfo lock is acquired
 * here, dropping/retaking the socket lock if needed to avoid a lock
 * inversion; in that case a PCB that died while unlocked is rejected.
 *
 * Returns: 0 Success
 *	ECONNABORTED	Socket was dropped while reacquiring locks
 *	ENOBUFS		Could not allocate the port hash head
 */
int
in_pcbinshash(struct inpcb *inp, int locked)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	u_int32_t hashkey_faddr;

	if (!locked) {
		if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
			/*lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
			if (inp->inp_state == INPCB_STATE_DEAD) {
				/* The socket got dropped when it was unlocked */
				lck_rw_done(pcbinfo->mtx);
				return(ECONNABORTED);
			}
		}
	}

#if INET6
	if (inp->inp_vflag & INP_IPV6)
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	else
#endif /* INET6 */
	hashkey_faddr = inp->inp_faddr.s_addr;

	inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->hashmask);

	pcbhash = &pcbinfo->hashbase[inp->hash_element];

	pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
	    pcbinfo->porthashmask)];

	/*
	 * Go through port list and look for a head for this lport.
	 */
	LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_WAITOK);
		if (phd == NULL) {
			if (!locked)
				lck_rw_done(pcbinfo->mtx);
			return (ENOBUFS); /* XXX */
		}
		phd->phd_port = inp->inp_lport;
		LIST_INIT(&phd->phd_pcblist);
		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	inp->inp_phd = phd;
	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	if (!locked)
		lck_rw_done(pcbinfo->mtx);
	return (0);
}
1681
1682 /*
1683 * Move PCB to the proper hash bucket when { faddr, fport } have been
1684 * changed. NOTE: This does not handle the case of the lport changing (the
1685 * hashed port list would have to be updated as well), so the lport must
1686 * not change after in_pcbinshash() has been called.
1687 */
/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 *
 * Caller must hold the pcbinfo lock exclusively.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

#if INET6
	if (inp->inp_vflag & INP_IPV6)
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	else
#endif /* INET6 */
	hashkey_faddr = inp->inp_faddr.s_addr;
	/* recompute the bucket from the (possibly new) foreign endpoint */
	inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, inp->inp_pcbinfo->hashmask);
	head = &inp->inp_pcbinfo->hashbase[inp->hash_element];

	LIST_REMOVE(inp, inp_hash);
	LIST_INSERT_HEAD(head, inp, inp_hash);
}
1707
1708 /*
1709 * Remove PCB from various lists.
1710 * Must be called pcbinfo lock is held in exclusive mode.
1711 */
/*
 * Remove PCB from various lists.
 * Must be called pcbinfo lock is held in exclusive mode.
 *
 * Unlinks the PCB from the address hash, the per-port list (freeing the
 * port head if it becomes empty), the global PCB list, and the flow
 * control tree, and drops the pcbinfo PCB count.
 */
void
in_pcbremlists(struct inpcb *inp)
{
	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;

	if (inp->inp_lport) {
		struct inpcbport *phd = inp->inp_phd;

		LIST_REMOVE(inp, inp_hash);
		LIST_REMOVE(inp, inp_portlist);
		/* last PCB on this port: release the port head too */
		if (phd != NULL && (LIST_FIRST(&phd->phd_pcblist) == NULL)) {
			LIST_REMOVE(phd, phd_hash);
			FREE(phd, M_PCB);
		}
	}
	LIST_REMOVE(inp, inp_list);

	if (inp->inp_flags2 & INP2_IN_FCTREE) {
		/* INPFC_REMOVE detaches the inp from inp_fc_tree */
		inp_fc_getinp(inp->inp_flowhash,
		    (INPFC_SOLOCKED|INPFC_REMOVE));
		VERIFY(!(inp->inp_flags2 & INP2_IN_FCTREE));
	}
	inp->inp_pcbinfo->ipi_count--;
}
1736
1737 /* Mechanism used to defer the memory release of PCBs
1738 * The pcb list will contain the pcb until the ripper can clean it up if
1739 * the following conditions are met: 1) state "DEAD", 2) wantcnt is STOPUSING
1740 * 3) usecount is null
1741 * This function will be called to either mark the pcb as
1742 */
/* Mechanism used to defer the memory release of PCBs
 * The pcb list will contain the pcb until the ripper can clean it up if
 * the following conditions are met: 1) state "DEAD", 2) wantcnt is STOPUSING
 * 3) usecount is null
 * This function will be called to either mark the pcb as
 *
 * The want-count lives in the low 16 bits of inp_wantcnt and is updated
 * lock-free with compare-and-swap; the sentinel 0xffff means STOPUSING
 * (no new references allowed, PCB ready for recycling).
 *
 * mode:
 *	WNT_STOPUSING — mark DEAD and try to latch the STOPUSING sentinel;
 *	WNT_ACQUIRE   — take a reference unless already STOPUSING;
 *	WNT_RELEASE   — drop a reference; re-attempts STOPUSING if the
 *	                PCB state is DEAD.
 * locked: nonzero if the caller already holds the socket lock.
 * Returns the resulting state (WNT_STOPUSING / WNT_ACQUIRE / WNT_RELEASE).
 */
int
in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
{

	volatile UInt32 *wantcnt	= (volatile UInt32 *)&pcb->inp_wantcnt;
	UInt32 origwant;
	UInt32 newwant;

	switch (mode) {

	case WNT_STOPUSING:	/* try to mark the pcb as ready for recycling */

		/* compareswap with STOPUSING, if success we're good, if it's in use, will be marked later */

		if (locked == 0)
			socket_lock(pcb->inp_socket, 1);
		pcb->inp_state = INPCB_STATE_DEAD;

stopusing:
		if (pcb->inp_socket->so_usecount < 0)
			panic("in_pcb_checkstate STOP pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);
		if (locked == 0)
			socket_unlock(pcb->inp_socket, 1);

		origwant = *wantcnt;
		if ((UInt16) origwant == 0xffff ) /* should stop using */
			return (WNT_STOPUSING);
		newwant = 0xffff;
		if ((UInt16) origwant == 0) {/* try to mark it as unusable now */
			/* single CAS attempt; if it fails, a racing acquirer
			 * holds a reference and will re-drive STOPUSING on release */
			OSCompareAndSwap(origwant, newwant, wantcnt) ;
		}
		return (WNT_STOPUSING);
		break;

	case WNT_ACQUIRE:	/* try to increase reference to pcb */
				/* if WNT_STOPUSING should bail out */
		/*
		 * if socket state DEAD, try to set count to STOPUSING, return failed
		 * otherwise increase cnt
		 */
		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0xffff ) {/* should stop using */
			//	printf("in_pcb_checkstate: ACQ PCB was STOPUSING while release. odd pcb=%p\n", pcb);
				return (WNT_STOPUSING);
			}
			newwant = origwant + 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));
		return (WNT_ACQUIRE);
		break;

	case WNT_RELEASE:	/* release reference. if result is null and pcb state is DEAD,
				   set wanted bit to STOPUSING
				 */

		if (locked == 0)
			socket_lock(pcb->inp_socket, 1);

		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0x0 )
				panic("in_pcb_checkstate pcb=%p release with zero count", pcb);
			if ((UInt16) origwant == 0xffff ) {/* should stop using */
#if TEMPDEBUG
				printf("in_pcb_checkstate: REL PCB was STOPUSING while release. odd pcb=%p\n", pcb);
#endif
				if (locked == 0)
					socket_unlock(pcb->inp_socket, 1);
				return (WNT_STOPUSING);
			}
			newwant = origwant - 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));

		/* DEAD PCB: re-attempt the STOPUSING transition now */
		if (pcb->inp_state == INPCB_STATE_DEAD)
			goto stopusing;
		if (pcb->inp_socket->so_usecount < 0)
			panic("in_pcb_checkstate RELEASE pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);

		if (locked == 0)
			socket_unlock(pcb->inp_socket, 1);
		return (WNT_RELEASE);
		break;

	default:

		panic("in_pcb_checkstate: so=%p not a valid state =%x\n", pcb->inp_socket, mode);
	}

	/* NOTREACHED */
	return (mode);
}
1834
1835 /*
1836 * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
1837 * The inpcb_compat data structure is passed to user space and must
1838 * not change. We intentionally avoid copying pointers.
1839 */
/*
 * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
 * The inpcb_compat data structure is passed to user space and must
 * not change. We intentionally avoid copying pointers.
 */
void
inpcb_to_compat(
	struct inpcb *inp,
	struct inpcb_compat *inp_compat)
{
	/* zero first so padding and un-copied fields don't leak kernel memory */
	bzero(inp_compat, sizeof(*inp_compat));
	inp_compat->inp_fport = inp->inp_fport;
	inp_compat->inp_lport = inp->inp_lport;
	inp_compat->nat_owner = 0;
	inp_compat->nat_cookie = inp->nat_cookie;
	inp_compat->inp_gencnt = inp->inp_gencnt;
	inp_compat->inp_flags = inp->inp_flags;
	inp_compat->inp_flow = inp->inp_flow;
	inp_compat->inp_vflag = inp->inp_vflag;
	inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
	inp_compat->inp_ip_p = inp->inp_ip_p;
	inp_compat->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
	inp_compat->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
	inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
	inp_compat->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
	inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
	inp_compat->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
	inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
}
1864
1865 #if !CONFIG_EMBEDDED
1866
/*
 * Copy the exported (non-pointer) fields of an inpcb into the 64-bit
 * sysctl export structure xinpcb64.  NOTE(review): unlike
 * inpcb_to_compat() above, the destination is not zeroed here —
 * presumably the caller pre-zeroes it; confirm against callers.
 */
void
inpcb_to_xinpcb64(
	struct inpcb *inp,
	struct xinpcb64 *xinp)
{
	xinp->inp_fport = inp->inp_fport;
	xinp->inp_lport = inp->inp_lport;
	xinp->inp_gencnt = inp->inp_gencnt;
	xinp->inp_flags = inp->inp_flags;
	xinp->inp_flow = inp->inp_flow;
	xinp->inp_vflag = inp->inp_vflag;
	xinp->inp_ip_ttl = inp->inp_ip_ttl;
	xinp->inp_ip_p = inp->inp_ip_p;
	xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
	xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
	xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
	xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
	xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
	xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
	xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
}
1888
1889 #endif /* !CONFIG_EMBEDDED */
1890
1891
1892 /*
1893 * The following routines implement this scheme:
1894 *
1895 * Callers of ip_output() that intend to cache the route in the inpcb pass
1896 * a local copy of the struct route to ip_output(). Using a local copy of
1897 * the cached route significantly simplifies things as IP no longer has to
1898 * worry about having exclusive access to the passed in struct route, since
1899 * it's defined in the caller's stack; in essence, this allows for a lock-
1900 * less operation when updating the struct route at the IP level and below,
1901 * whenever necessary. The scheme works as follows:
1902 *
1903 * Prior to dropping the socket's lock and calling ip_output(), the caller
1904 * copies the struct route from the inpcb into its stack, and adds a reference
1905 * to the cached route entry, if there was any. The socket's lock is then
1906 * dropped and ip_output() is called with a pointer to the copy of struct
1907 * route defined on the stack (not to the one in the inpcb.)
1908 *
1909 * Upon returning from ip_output(), the caller then acquires the socket's
1910 * lock and synchronizes the cache; if there is no route cached in the inpcb,
1911 * it copies the local copy of struct route (which may or may not contain any
1912 * route) back into the cache; otherwise, if the inpcb has a route cached in
1913 * it, the one in the local copy will be freed, if there's any. Trashing the
1914 * cached route in the inpcb can be avoided because ip_output() is single-
1915 * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
1916 * by the socket/transport layer.)
1917 */
/*
 * Copy the PCB's cached route into the caller's stack copy 'dst'
 * (taking a reference on the route entry, if any) prior to calling
 * ip_output() without the socket lock — see the scheme described in
 * the block comment above.  A cached route that is not IPv4 (possible
 * with IPv4-mapped addresses) is discarded first.
 *
 * Caller must hold the per-PCB mutex (asserted).
 */
void
inp_route_copyout(struct inpcb *inp, struct route *dst)
{
	struct route *src = &inp->inp_route;

	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	/*
	 * If the route in the PCB is not for IPv4, blow it away;
	 * this is possible in the case of IPv4-mapped address case.
	 */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
		rtfree(src->ro_rt);
		src->ro_rt = NULL;
	}

	route_copyout(dst, src, sizeof(*dst));
}
1936
/*
 * Synchronize the caller's stack copy of the route back into the PCB
 * cache after ip_output() returns — the counterpart of
 * inp_route_copyout() in the scheme described above.  Panics if the
 * stack copy holds a non-IPv4 route, which would indicate corruption.
 *
 * Caller must hold the per-PCB mutex (asserted).
 */
void
inp_route_copyin(struct inpcb *inp, struct route *src)
{
	struct route *dst = &inp->inp_route;

	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	/* Minor sanity check */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET)
		panic("%s: wrong or corrupted route: %p", __func__, src);

	route_copyin(src, dst, sizeof(*src));
}
1950
1951 /*
1952 * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option.
1953 */
/*
 * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option.
 *
 * Validates the interface scope (index), records the bound interface in
 * the PCB (IFSCOPE_NONE unbinds), updates INP_BOUND_IF accordingly, and
 * flushes any cached route so the next output re-resolves with the new
 * scope.
 *
 * Returns: 0 Success
 *	ENXIO	ifscope is out of range or names no attached interface
 */
int
inp_bindif(struct inpcb *inp, unsigned int ifscope)
{
	struct ifnet *ifp = NULL;

	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE &&
	    (ifp = ifindex2ifnet[ifscope]) == NULL)) {
		ifnet_head_done();
		return (ENXIO);
	}
	ifnet_head_done();

	VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE);

	/*
	 * A zero interface scope value indicates an "unbind".
	 * Otherwise, take in whatever value the app desires;
	 * the app may already know the scope (or force itself
	 * to such a scope) ahead of time before the interface
	 * gets attached. It doesn't matter either way; any
	 * route lookup from this point on will require an
	 * exact match for the embedded interface scope.
	 */
	inp->inp_boundifp = ifp;
	if (inp->inp_boundifp == NULL)
		inp->inp_flags &= ~INP_BOUND_IF;
	else
		inp->inp_flags |= INP_BOUND_IF;

	/* Blow away any cached route in the PCB */
	if (inp->inp_route.ro_rt != NULL) {
		rtfree(inp->inp_route.ro_rt);
		inp->inp_route.ro_rt = NULL;
	}

	return (0);
}
1992
1993 /*
1994 * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option.
1995 */
1996 int
1997 inp_nocellular(struct inpcb *inp, unsigned int val)
1998 {
1999 if (val) {
2000 inp->inp_flags |= INP_NO_IFT_CELLULAR;
2001 } else if (inp->inp_flags & INP_NO_IFT_CELLULAR) {
2002 /* once set, it cannot be unset */
2003 return (EINVAL);
2004 }
2005
2006 /* Blow away any cached route in the PCB */
2007 if (inp->inp_route.ro_rt != NULL) {
2008 rtfree(inp->inp_route.ro_rt);
2009 inp->inp_route.ro_rt = NULL;
2010 }
2011
2012 return (0);
2013 }
2014
2015 /*
2016 * Calculate flow hash for an inp, used by an interface to identify a
2017 * flow. When an interface provides flow control advisory, this flow
2018 * hash is used as an identifier.
2019 */
/*
 * Calculate flow hash for an inp, used by an interface to identify a
 * flow. When an interface provides flow control advisory, this flow
 * hash is used as an identifier.
 *
 * The hash covers { laddr, faddr, lport, fport, af, proto } plus two
 * random salts, seeded by inp_hash_seed.  The result must be nonzero
 * and unique within inp_fc_tree; on a zero hash or a tree collision
 * the seed is regenerated and the hash recomputed.  Side effects:
 * stores the hash in inp->inp_flowhash and inserts the inp into
 * inp_fc_tree (setting INP2_IN_FCTREE).
 */
u_int32_t
inp_calc_flowhash(struct inpcb *inp)
{
	struct inp_flowhash_key fh __attribute__((aligned(8)));
	u_int32_t flowhash = 0;
	struct inpcb *tmp_inp = NULL;

	if (inp_hash_seed == 0)
		inp_hash_seed = RandomULong();

	bzero(&fh, sizeof (fh));

	bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof (fh.infh_laddr));
	bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof (fh.infh_faddr));

	fh.infh_lport = inp->inp_lport;
	fh.infh_fport = inp->inp_fport;
	fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET;
	fh.infh_proto = inp->inp_ip_p;
	fh.infh_rand1 = RandomULong();
	fh.infh_rand2 = RandomULong();

try_again:
	flowhash = net_flowhash(&fh, sizeof (fh), inp_hash_seed);
	if (flowhash == 0) {
		/* try to get a non-zero flowhash */
		inp_hash_seed = RandomULong();
		goto try_again;
	}

	inp->inp_flowhash = flowhash;

	/* Insert the inp into inp_fc_tree */

	lck_mtx_lock(&inp_fc_lck);
	tmp_inp = RB_FIND(inp_fc_tree, &inp_fc_tree, inp);
	if (tmp_inp != NULL) {
		/*
		 * There is a different inp with the same flowhash.
		 * There can be a collision on flow hash but the
		 * probability is low. Let's recompute the
		 * flowhash.
		 */
		lck_mtx_unlock(&inp_fc_lck);
		/* recompute hash seed */
		inp_hash_seed = RandomULong();
		goto try_again;
	}
	RB_INSERT(inp_fc_tree, &inp_fc_tree, inp);
	inp->inp_flags2 |= INP2_IN_FCTREE;
	lck_mtx_unlock(&inp_fc_lck);

	return flowhash;
}
2074
2075 /*
2076 * Function to compare inp_fc_entries in inp flow control tree
2077 */
2078 static inline int
2079 infc_cmp(const struct inpcb *inp1, const struct inpcb *inp2)
2080 {
2081 return (memcmp(&(inp1->inp_flowhash), &(inp2->inp_flowhash),
2082 sizeof(inp1->inp_flowhash)));
2083 }
2084
/*
 * Look up an inp in the flow control tree by its flow hash.
 *
 * flags:
 *	INPFC_SOLOCKED — caller already holds the socket lock (passed
 *	                 through to in_pcb_checkstate).
 *	INPFC_REMOVE   — detach the inp from the tree and clear
 *	                 INP2_IN_FCTREE; returns NULL in this mode.
 *
 * On a plain lookup, returns the inp with a want-count reference taken
 * (NULL if absent or already STOPUSING).  Uses a shared global key_inp
 * as the RB_FIND probe, serialized by inp_fc_lck.
 */
struct inpcb *
inp_fc_getinp(u_int32_t flowhash, u_int32_t flags)
{
	struct inpcb *inp = NULL;
	int locked = (flags & INPFC_SOLOCKED) ? 1 : 0;

	lck_mtx_lock_spin(&inp_fc_lck);
	key_inp.inp_flowhash = flowhash;
	inp = RB_FIND(inp_fc_tree, &inp_fc_tree, &key_inp);
	if (inp == NULL) {
		/* inp is not present, return */
		lck_mtx_unlock(&inp_fc_lck);
		return (NULL);
	}

	if (flags & INPFC_REMOVE) {
		RB_REMOVE(inp_fc_tree, &inp_fc_tree, inp);
		lck_mtx_unlock(&inp_fc_lck);

		/* clear the stale RB linkage outside the lock */
		bzero(&(inp->infc_link), sizeof (inp->infc_link));
		inp->inp_flags2 &= ~INP2_IN_FCTREE;
		return (NULL);
	}
	/* pin the inp before handing it back; dead PCBs read as absent */
	if (in_pcb_checkstate(inp, WNT_ACQUIRE, locked) == WNT_STOPUSING)
		inp = NULL;
	lck_mtx_unlock(&inp_fc_lck);

	return (inp);
}
2114
2115 void
2116 inp_fc_feedback(struct inpcb *inp)
2117 {
2118 struct socket *so = inp->inp_socket;
2119
2120 /* we already hold a want_cnt on this inp, socket can't be null */
2121 VERIFY (so != NULL);
2122 socket_lock(so, 1);
2123
2124 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2125 socket_unlock(so, 1);
2126 return;
2127 }
2128
2129 /*
2130 * Return if the connection is not in flow-controlled state.
2131 * This can happen if the connection experienced
2132 * loss while it was in flow controlled state
2133 */
2134 if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) {
2135 socket_unlock(so, 1);
2136 return;
2137 }
2138 inp_reset_fc_state(inp);
2139
2140 if (so->so_proto->pr_type == SOCK_STREAM)
2141 inp_fc_unthrottle_tcp(inp);
2142
2143 socket_unlock(so, 1);
2144 }
2145
2146 void
2147 inp_reset_fc_state(struct inpcb *inp)
2148 {
2149 struct socket *so = inp->inp_socket;
2150 int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0;
2151 int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0;
2152
2153 inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
2154
2155 if (suspended) {
2156 so->so_flags &= ~(SOF_SUSPENDED);
2157 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME));
2158 }
2159
2160 if (inp->inp_sndinprog_cnt > 0)
2161 inp->inp_flags |= INP_FC_FEEDBACK;
2162
2163 /* Give a write wakeup to unblock the socket */
2164 if (needwakeup)
2165 sowwakeup(so);
2166 }
2167
2168 int
2169 inp_set_fc_state(struct inpcb *inp, int advcode)
2170 {
2171 struct inpcb *tmp_inp = NULL;
2172 /*
2173 * If there was a feedback from the interface when
2174 * send operation was in progress, we should ignore
2175 * this flow advisory to avoid a race between setting
2176 * flow controlled state and receiving feedback from
2177 * the interface
2178 */
2179 if (inp->inp_flags & INP_FC_FEEDBACK)
2180 return(0);
2181
2182 inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
2183 if ((tmp_inp = inp_fc_getinp(inp->inp_flowhash, INPFC_SOLOCKED))
2184 != NULL) {
2185 if (in_pcb_checkstate(tmp_inp, WNT_RELEASE, 1)
2186 == WNT_STOPUSING)
2187 return (0);
2188 VERIFY(tmp_inp == inp);
2189 switch (advcode) {
2190 case FADV_FLOW_CONTROLLED:
2191 inp->inp_flags |= INP_FLOW_CONTROLLED;
2192 break;
2193 case FADV_SUSPENDED:
2194 inp->inp_flags |= INP_FLOW_SUSPENDED;
2195 soevent(inp->inp_socket,
2196 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND));
2197
2198 /* Record the fact that suspend event was sent */
2199 inp->inp_socket->so_flags |= SOF_SUSPENDED;
2200 break;
2201 }
2202 return (1);
2203 }
2204 return(0);
2205 }
2206
2207 /*
2208 * Handler for SO_FLUSH socket option.
2209 */
2210 int
2211 inp_flush(struct inpcb *inp, int optval)
2212 {
2213 u_int32_t flowhash = inp->inp_flowhash;
2214 struct rtentry *rt;
2215
2216 /* Either all classes or one of the valid ones */
2217 if (optval != SO_TC_ALL && !SO_VALID_TC(optval))
2218 return (EINVAL);
2219
2220 /* We need a flow hash for identification */
2221 if (flowhash == 0)
2222 return (0);
2223
2224 /* We need a cached route for the interface */
2225 if ((rt = inp->inp_route.ro_rt) != NULL) {
2226 struct ifnet *ifp = rt->rt_ifp;
2227 if_qflush_sc(ifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
2228 }
2229
2230 return (0);
2231 }
2232
2233 /*
2234 * Clear the INP_INADDR_ANY flag (special case for PPP only)
2235 */
2236 void inp_clear_INP_INADDR_ANY(struct socket *so)
2237 {
2238 struct inpcb *inp = NULL;
2239
2240 socket_lock(so, 1);
2241 inp = sotoinpcb(so);
2242 if (inp) {
2243 inp->inp_flags &= ~INP_INADDR_ANY;
2244 }
2245 socket_unlock(so, 1);
2246 }
2247