]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/in_pcb.c
xnu-2050.24.15.tar.gz
[apple/xnu.git] / bsd / netinet / in_pcb.c
1 /*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1991, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62 */
63
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/domain.h>
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/proc.h>
73 #ifndef __APPLE__
74 #include <sys/jail.h>
75 #endif
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <sys/mcache.h>
79 #include <sys/kauth.h>
80 #include <sys/priv.h>
81 #include <libkern/OSAtomic.h>
82 #include <kern/locks.h>
83
84 #include <machine/limits.h>
85
86 #ifdef __APPLE__
87 #include <kern/zalloc.h>
88 #endif
89
90 #include <net/if.h>
91 #include <net/if_types.h>
92 #include <net/route.h>
93 #include <net/flowhash.h>
94 #include <net/flowadv.h>
95
96 #include <netinet/in.h>
97 #include <netinet/in_pcb.h>
98 #include <netinet/in_var.h>
99 #include <netinet/ip_var.h>
100 #if INET6
101 #include <netinet/ip6.h>
102 #include <netinet6/ip6_var.h>
103 #endif /* INET6 */
104
105 #if IPSEC
106 #include <netinet6/ipsec.h>
107 #include <netkey/key.h>
108 #endif /* IPSEC */
109
110 #include <sys/kdebug.h>
111 #include <sys/random.h>
112 #include <dev/random/randomdev.h>
113
114 #if IPSEC
115 extern int ipsec_bypass;
116 #endif
117
118 #define DBG_FNC_PCB_LOOKUP NETDBG_CODE(DBG_NETTCP, (6 << 8))
119 #define DBG_FNC_PCB_HLOOKUP NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))
120
/* All-zeroes (INADDR_ANY) address; static storage is zero-initialized. */
struct in_addr zeroin_addr;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 *
 * The "low" range is walked downward (1023 -> 600) by in_pcbbind();
 * the default and "hi" ranges are walked upward.  All are writable
 * through the net.inet.ip.portrange sysctl node below, clamped by
 * sysctl_net_ipport_check().
 */
int	ipport_lowfirstauto  = IPPORT_RESERVED - 1;	/* 1023 */
int	ipport_lowlastauto = IPPORT_RESERVEDSTART;	/* 600 */
#ifndef __APPLE__
int	ipport_firstauto = IPPORT_RESERVED;		/* 1024 */
int	ipport_lastauto  = IPPORT_USERRESERVED;		/* 5000 */
#else
/* Apple defaults to the IANA dynamic/ephemeral range. */
int	ipport_firstauto = IPPORT_HIFIRSTAUTO;    	/* 49152 */
int	ipport_lastauto  = IPPORT_HILASTAUTO;		/* 65535 */
#endif
int	ipport_hifirstauto = IPPORT_HIFIRSTAUTO;	/* 49152 */
int	ipport_hilastauto = IPPORT_HILASTAUTO;		/* 65535 */
/*
 * Clamp (var) into the inclusive range [min, max].
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement; the previous bare if/else-if form was a dangling-else
 * hazard when used unbraced inside another if (CERT PRE10-C).
 * Arguments may be evaluated more than once; do not pass expressions
 * with side effects.
 */
#define RANGECHK(var, min, max) do {					\
	if ((var) < (min)) { (var) = (min); }				\
	else if ((var) > (max)) { (var) = (max); }			\
} while (0)
142
143 static int
144 sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
145 {
146 #pragma unused(arg1, arg2)
147 int error = sysctl_handle_int(oidp,
148 oidp->oid_arg1, oidp->oid_arg2, req);
149 if (!error) {
150 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
151 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
152 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
153 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
154 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
155 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
156 }
157 return error;
158 }
159
#undef RANGECHK

/*
 * net.inet.ip.portrange.* — expose the ephemeral port ranges to
 * userland; every oid funnels through sysctl_net_ipport_check so
 * writes are range-clamped.
 */
SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IP Ports");

SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_LOCKED,
	   &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
176
/* Per-protocol knobs for random ephemeral port selection (see in_pcbbind). */
extern int	udp_use_randomport;
extern int	tcp_use_randomport;

/* Structs used for flowhash computation */
struct inp_flowhash_key_addr {
	union {
		/* Union is sized for the larger (IPv6) address. */
		struct in_addr	v4;
		struct in6_addr v6;
		u_int8_t	addr8[16];
		u_int16_t	addr16[8];
		u_int32_t	addr32[4];
	} infha;
};

/*
 * Flow-hash input key: local/foreign address and port, address
 * family, protocol, plus two random words to make hashes hard to
 * predict from outside.
 */
struct inp_flowhash_key {
	struct inp_flowhash_key_addr	infh_laddr;
	struct inp_flowhash_key_addr	infh_faddr;
	u_int32_t			infh_lport;
	u_int32_t			infh_fport;
	u_int32_t			infh_af;
	u_int32_t			infh_proto;
	u_int32_t			infh_rand1;
	u_int32_t			infh_rand2;
};

/* Seed for the flow hash; 0 until initialized elsewhere. */
u_int32_t inp_hash_seed = 0;

static __inline int infc_cmp(const struct inp_fc_entry *,
    const struct inp_fc_entry *);

/* Locking state protecting the flow-control (flow advisory) tree. */
lck_grp_t *inp_lck_grp;
lck_grp_attr_t *inp_lck_grp_attr;
lck_attr_t *inp_lck_attr;
decl_lck_mtx_data(, inp_fc_lck);

/* Red-black tree of flow-control entries, keyed via infc_cmp(). */
RB_HEAD(inp_fc_tree, inp_fc_entry) inp_fc_tree;
RB_PROTOTYPE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp);

RB_GENERATE(inp_fc_tree, inp_fc_entry, infc_link, infc_cmp);

/* Zone backing inp_fc_entry allocations; set up in socket_flowadv_init(). */
static unsigned int inp_fcezone_size;
static struct zone *inp_fcezone;
#define INP_FCEZONE_NAME "inp_fcezone"
#define INP_FCEZONE_MAX 32
220
221 /*
222 * in_pcb.c: manage the Protocol Control Blocks.
223 */
224
/*
 * Initialize data structures required to deliver flow advisories:
 * the mutex guarding the flow-control tree, the tree itself, and
 * the zone from which inp_fc_entry structures are allocated.
 * Called once; panics if the zone cannot be created.
 */
void
socket_flowadv_init(void)
{
	/* Lock group/attributes must exist before the mutex itself. */
	inp_lck_grp_attr = lck_grp_attr_alloc_init();
	inp_lck_grp = lck_grp_alloc_init("inp_lck_grp", inp_lck_grp_attr);

	inp_lck_attr = lck_attr_alloc_init();
	lck_mtx_init(&inp_fc_lck, inp_lck_grp, inp_lck_attr);

	RB_INIT(&inp_fc_tree);

	/* Round the element size up to an 8-byte multiple for the zone. */
	inp_fcezone_size = P2ROUNDUP(sizeof (struct inp_fc_entry),
	    sizeof (u_int64_t));
	inp_fcezone = zinit(inp_fcezone_size,
	    INP_FCEZONE_MAX * inp_fcezone_size, 0, INP_FCEZONE_NAME);
	if (inp_fcezone == NULL) {
		panic("%s: failed allocating %s", __func__,
		    INP_FCEZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(inp_fcezone, Z_EXPAND, TRUE);
	zone_change(inp_fcezone, Z_CALLERACCT, FALSE);
}
252
/*
 * Allocate a PCB and associate it with the socket.
 *
 * If the socket layer has a cached PCB (so->cached_in_sock_layer),
 * the previously saved block is reused instead of hitting the zone
 * allocator; otherwise a fresh inpcb is drawn from pcbinfo->ipi_zone.
 * On success the PCB is linked onto pcbinfo's global list under the
 * pcbinfo rwlock.
 *
 * Returns:	0			Success
 *		ENOBUFS
 *		ENOMEM
 *		ipsec_init_policy:???	[IPSEC]
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc *p)
{
	struct inpcb *inp;
	caddr_t	temp;
#if IPSEC
#ifndef __APPLE__
	int error;
#endif
#endif
#if CONFIG_MACF_NET
	int mac_error;
#endif

	if (so->cached_in_sock_layer == 0) {
#if TEMPDEBUG
		printf("PCBALLOC calling zalloc for socket %x\n", so);
#endif
		inp = (struct inpcb *) zalloc(pcbinfo->ipi_zone);
		if (inp == NULL)
			return (ENOBUFS);
		bzero((caddr_t)inp, sizeof(*inp));
	}
	else {
#if TEMPDEBUG
		printf("PCBALLOC reusing PCB for socket %x\n", so);
#endif
		/*
		 * Reuse the PCB the socket layer saved on a previous
		 * dispose; preserve inp_saved_ppcb across the bzero.
		 */
		inp = (struct inpcb *)(void *)so->so_saved_pcb;
		temp = inp->inp_saved_ppcb;
		bzero((caddr_t) inp, sizeof(*inp));
		inp->inp_saved_ppcb = temp;
	}

	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
#if CONFIG_MACF_NET
	/* Undo the zone allocation if MAC label setup fails. */
	mac_error = mac_inpcb_label_init(inp, M_WAITOK);
	if (mac_error != 0) {
		if (so->cached_in_sock_layer == 0)
			zfree(pcbinfo->ipi_zone, inp);
		return (mac_error);
	}
	mac_inpcb_label_associate(so, inp);
#endif
	/*
	 * Make sure inp_stat is always 64-bit aligned: point it at the
	 * first 8-byte boundary inside inp_stat_store and verify the
	 * aligned struct still fits in the backing store.
	 */
	inp->inp_stat = (struct inp_stat*)P2ROUNDUP(inp->inp_stat_store, sizeof(u_int64_t));
	if (((uintptr_t)inp->inp_stat - (uintptr_t)inp->inp_stat_store)
		+ sizeof(*inp->inp_stat) > sizeof(inp->inp_stat_store)) {
		panic("insufficient space to align inp_stat");
	}

	so->so_pcb = (caddr_t)inp;

	/* Per-PCB mutex only for protocols that request PR_PCBLOCK. */
	if (so->so_proto->pr_flags & PR_PCBLOCK) {
		lck_mtx_init(&inp->inpcb_mtx, pcbinfo->mtx_grp, pcbinfo->mtx_attr);
	}

#if IPSEC
#ifndef __APPLE__
	if (ipsec_bypass == 0) {
		error = ipsec_init_policy(so, &inp->inp_sp);
		if (error != 0) {
			zfree(pcbinfo->ipi_zone, inp);
			return error;
		}
	}
#endif
#endif /*IPSEC*/
#if INET6
	if (INP_SOCKAF(so) == AF_INET6 && !ip6_mapped_addr_on)
		inp->inp_flags |= IN6P_IPV6_V6ONLY;
#endif

#if INET6
	if (ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	/* Insert onto the global PCB list under the info rwlock. */
	lck_rw_lock_exclusive(pcbinfo->mtx);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	lck_rw_done(pcbinfo->mtx);
	return (0);
}
346
347
/*
 * in_pcblookup_local_and_cleanup does everything
 * in_pcblookup_local does but it checks for a socket
 * that's going away.  Since we know that the lock is
 * held read+write when this function is called, we
 * can safely dispose of this socket like the slow
 * timer would usually do and return NULL.  This is
 * great for bind.
 */
struct inpcb*
in_pcblookup_local_and_cleanup(
	struct inpcbinfo *pcbinfo,
	struct in_addr laddr,
	u_int lport_arg,
	int wild_okay)
{
	struct inpcb *inp;

	/* Perform normal lookup */
	inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);

	/* Check if we found a match but it's waiting to be disposed */
	if (inp && inp->inp_wantcnt == WNT_STOPUSING) {
		struct socket *so = inp->inp_socket;

		lck_mtx_lock(&inp->inpcb_mtx);

		if (so->so_usecount == 0) {
			/*
			 * No remaining users: tear the PCB down now
			 * (detach first unless already marked DEAD)
			 * and report "no match" to the caller.
			 * in_pcbdispose() consumes the mutex we hold.
			 */
			if (inp->inp_state != INPCB_STATE_DEAD)
				in_pcbdetach(inp);
			in_pcbdispose(inp);
			inp = NULL;
		}
		else {
			/* Still in use elsewhere; leave it alone. */
			lck_mtx_unlock(&inp->inpcb_mtx);
		}
	}

	return inp;
}
388
389 #ifdef __APPLE_API_PRIVATE
390 static void
391 in_pcb_conflict_post_msg(u_int16_t port)
392 {
393 /*
394 * Radar 5523020 send a kernel event notification if a non-participating socket tries to bind
395 * the port a socket who has set SOF_NOTIFYCONFLICT owns.
396 */
397 struct kev_msg ev_msg;
398 struct kev_in_portinuse in_portinuse;
399
400 bzero(&in_portinuse, sizeof(struct kev_in_portinuse));
401 bzero(&ev_msg, sizeof(struct kev_msg));
402 in_portinuse.port = ntohs(port); /* port in host order */
403 in_portinuse.req_pid = proc_selfpid();
404 ev_msg.vendor_code = KEV_VENDOR_APPLE;
405 ev_msg.kev_class = KEV_NETWORK_CLASS;
406 ev_msg.kev_subclass = KEV_INET_SUBCLASS;
407 ev_msg.event_code = KEV_INET_PORTINUSE;
408 ev_msg.dv[0].data_ptr = &in_portinuse;
409 ev_msg.dv[0].data_length = sizeof(struct kev_in_portinuse);
410 ev_msg.dv[1].data_length = 0;
411 kev_post_msg(&ev_msg);
412 }
413 #endif
414 /*
415 * Returns: 0 Success
416 * EADDRNOTAVAIL Address not available.
417 * EINVAL Invalid argument
418 * EAFNOSUPPORT Address family not supported [notdef]
419 * EACCES Permission denied
420 * EADDRINUSE Address in use
421 * EAGAIN Resource unavailable, try again
422 * priv_check_cred:EPERM Operation not permitted
423 */
/*
 * Bind a PCB to a local address and/or port.  If 'nam' is NULL or
 * carries port 0, an ephemeral port is chosen from the configured
 * range (low/default/hi, optionally starting at a random offset).
 *
 * Locking: enters with the socket lock held; drops it while taking
 * the pcbinfo rwlock exclusively (keeping a reference on the socket)
 * and re-takes it before returning.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
{
	struct socket *so = inp->inp_socket;
	unsigned short *lastport;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	u_short lport = 0, rand_port = 0;
	int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error, randomport, conflict = 0;
	kauth_cred_t cred;

	if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
		return (EADDRNOTAVAIL);
	/* Refuse to re-bind an already bound PCB. */
	if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
		wild = 1;
	socket_unlock(so, 0); /* keep reference on socket */
	lck_rw_lock_exclusive(pcbinfo->mtx);
	if (nam) {
		struct ifnet *outif = NULL;

		sin = (struct sockaddr_in *)(void *)nam;
		if (nam->sa_len != sizeof (*sin)) {
			lck_rw_done(pcbinfo->mtx);
			socket_lock(so, 0);
			return (EINVAL);
		}
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET) {
			lck_rw_done(pcbinfo->mtx);
			socket_lock(so, 0);
			return (EAFNOSUPPORT);
		}
#endif
		lport = sin->sin_port;
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if (so->so_options & SO_REUSEADDR)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			/* A specific unicast address must be local. */
			struct ifaddr *ifa;
			sin->sin_port = 0;		/* yech... */
			if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin)) == 0) {
				lck_rw_done(pcbinfo->mtx);
				socket_lock(so, 0);
				return (EADDRNOTAVAIL);
			}
			else {
				/* Remember the owning interface. */
				IFA_LOCK(ifa);
				outif = ifa->ifa_ifp;
				IFA_UNLOCK(ifa);
				IFA_REMREF(ifa);
			}
		}
		if (lport) {
			struct inpcb *t;

			/* GROSS */
#if !CONFIG_EMBEDDED
			/* Reserved ports require PRIV_NETINET_RESERVEDPORT. */
			if (ntohs(lport) < IPPORT_RESERVED) {
				cred = kauth_cred_proc_ref(p);
				error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
				kauth_cred_unref(&cred);
				if (error != 0) {
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					return (EACCES);
				}
			}
#endif
			/*
			 * Cross-uid conflict check: non-root sockets (uid
			 * != 0) may not take over a non-multicast port
			 * bound by a different uid, unless the owner opted
			 * in via SOF_REUSESHAREUID.
			 */
			if (kauth_cred_getuid(so->so_cred) &&
			    !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
				t = in_pcblookup_local_and_cleanup(inp->inp_pcbinfo,
				    sin->sin_addr, lport, INPLOOKUP_WILDCARD);
				if (t &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_socket->so_options &
					SO_REUSEPORT) == 0) &&
				    (kauth_cred_getuid(so->so_cred) !=
				     kauth_cred_getuid(t->inp_socket->so_cred)) &&
				    ((t->inp_socket->so_flags & SOF_REUSESHAREUID) == 0) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY))
				{
#ifdef __APPLE_API_PRIVATE
					/*
					 * Notify the owner (if it asked via
					 * SOF_NOTIFYCONFLICT) after dropping
					 * the pcbinfo lock.
					 */
					if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
						conflict = 1;

					lck_rw_done(pcbinfo->mtx);

					if (conflict)
						in_pcb_conflict_post_msg(lport);
#else
					lck_rw_done(pcbinfo->mtx);
#endif /* __APPLE_API_PRIVATE */

					socket_lock(so, 0);
					return (EADDRINUSE);
				}
			}
			/* General address/port collision check. */
			t = in_pcblookup_local_and_cleanup(pcbinfo, sin->sin_addr,
			    lport, wild);
			if (t &&
			    (reuseport & t->inp_socket->so_options) == 0) {
#if INET6
				/*
				 * Two v6 sockets both bound to the v4
				 * wildcard are allowed to coexist here.
				 */
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    INP_SOCKAF(so) != AF_INET6 ||
				    INP_SOCKAF(t->inp_socket) != AF_INET6)
#endif /* INET6 */
				{
#ifdef __APPLE_API_PRIVATE

					if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
						conflict = 1;

					lck_rw_done(pcbinfo->mtx);

					if (conflict)
						in_pcb_conflict_post_msg(lport);
#else
					lck_rw_done(pcbinfo->mtx);
#endif /* __APPLE_API_PRIVATE */
					socket_lock(so, 0);
					return (EADDRINUSE);
				}
			}
		}
		inp->inp_laddr = sin->sin_addr;
		inp->inp_last_outifp = outif;
	}
	if (lport == 0) {
		/* No port requested: pick an ephemeral one. */
		u_short first, last;
		int count;

		randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
		    (so->so_type == SOCK_STREAM ? tcp_use_randomport : udp_use_randomport);

		inp->inp_flags |= INP_ANONPORT;

		if (inp->inp_flags & INP_HIGHPORT) {
			first = ipport_hifirstauto;	/* sysctl */
			last  = ipport_hilastauto;
			lastport = &pcbinfo->lasthi;
		} else if (inp->inp_flags & INP_LOWPORT) {
			/* Low (reserved) range needs privilege. */
			cred = kauth_cred_proc_ref(p);
			error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
			kauth_cred_unref(&cred);
			if (error != 0) {
				lck_rw_done(pcbinfo->mtx);
				socket_lock(so, 0);
				return error;
			}
			first = ipport_lowfirstauto;	/* 1023 */
			last  = ipport_lowlastauto;	/* 600 */
			lastport = &pcbinfo->lastlow;
		} else {
			first = ipport_firstauto;	/* sysctl */
			last  = ipport_lastauto;
			lastport = &pcbinfo->lastport;
		}
		/* No point in randomizing if only one port is available */

		if (first == last)
			randomport = 0;
		/*
		 * Simple check to ensure all ports are not used up causing
		 * a deadlock here.
		 *
		 * We split the two cases (up and down) so that the direction
		 * is not being tested on each round of the loop.
		 */
		if (first > last) {
			/*
			 * counting down
			 */
			if (randomport) {
				read_random(&rand_port, sizeof(rand_port));
				*lastport = first - (rand_port % (first - last));
			}
			count = first - last;

			do {
				if (count-- < 0) {	/* completely used? */
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					inp->inp_laddr.s_addr = INADDR_ANY;
					inp->inp_last_outifp = NULL;
					return (EADDRNOTAVAIL);
				}
				--*lastport;
				/* Wrap back to 'first' when leaving [last, first]. */
				if (*lastport > first || *lastport < last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local_and_cleanup(pcbinfo,
			    inp->inp_laddr, lport, wild));
		} else {
			/*
			 * counting up
			 */
			if (randomport) {
				read_random(&rand_port, sizeof(rand_port));
				/*
				 * Note: (first - last) is negative here, but
				 * C's % takes the sign of the dividend, so
				 * rand_port % (first - last) lands in
				 * [0, last - first - 1] and *lastport in
				 * [first, last - 1].
				 */
				*lastport = first + (rand_port % (first - last));
			}
			count = last - first;

			do {
				if (count-- < 0) {	/* completely used? */
					lck_rw_done(pcbinfo->mtx);
					socket_lock(so, 0);
					inp->inp_laddr.s_addr = INADDR_ANY;
					inp->inp_last_outifp = NULL;
					return (EADDRNOTAVAIL);
				}
				++*lastport;
				/* Wrap back to 'first' when leaving [first, last]. */
				if (*lastport < first || *lastport > last)
					*lastport = first;
				lport = htons(*lastport);
			} while (in_pcblookup_local_and_cleanup(pcbinfo,
			    inp->inp_laddr, lport, wild));
		}
	}
	socket_lock(so, 0);
	inp->inp_lport = lport;
	if (in_pcbinshash(inp, 1) != 0) {
		/* Hash insert failed; undo the partial bind. */
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_last_outifp = NULL;
		lck_rw_done(pcbinfo->mtx);
		return (EAGAIN);
	}
	lck_rw_done(pcbinfo->mtx);
	sflt_notify(so, sock_evt_bound, NULL);
	return (0);
}
675
676 /*
677 * Transform old in_pcbconnect() into an inner subroutine for new
678 * in_pcbconnect(): Do some validity-checking on the remote
679 * address (in mbuf 'nam') and then determine local host address
680 * (i.e., which interface) to use to access that remote host.
681 *
682 * This preserves definition of in_pcbconnect(), while supporting a
683 * slightly different version for T/TCP. (This is more than
684 * a bit of a kludge, but cleaning up the internal interfaces would
685 * have forced minor changes in every protocol).
686 *
687 * Returns: 0 Success
688 * EINVAL Invalid argument
689 * EAFNOSUPPORT Address family not supported
690 * EADDRNOTAVAIL Address not available
691 */
/*
 * Validate the remote address in 'nam' and pick the local address
 * (and interface) to use to reach it.  The chosen address is returned
 * through *plocal_sin and, if non-NULL, the interface through *outif;
 * the PCB itself is not modified here.
 */
int
in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
    struct sockaddr_in *plocal_sin, struct ifnet **outif)
{
	struct in_ifaddr *ia;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;

	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	lck_rw_lock_shared(in_ifaddr_rwlock);
	if (!TAILQ_EMPTY(&in_ifaddrhead)) {
		ia = TAILQ_FIRST(&in_ifaddrhead);
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		if (sin->sin_addr.s_addr == INADDR_ANY)
			sin->sin_addr = IA_SIN(ia)->sin_addr;
		else if (sin->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST &&
		    (ia->ia_ifp->if_flags & IFF_BROADCAST))
			sin->sin_addr = satosin(&ia->ia_broadaddr)->sin_addr;
		IFA_UNLOCK(&ia->ia_ifa);
		ia = NULL;
	}
	lck_rw_done(in_ifaddr_rwlock);

	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		struct route *ro;
		unsigned int ifscope = IFSCOPE_NONE;
		unsigned int nocell;
		/*
		 * If the socket is bound to a specific interface, the
		 * optional scope takes precedence over that if it
		 * is set by the caller.
		 */
		ia = (struct in_ifaddr *)0;

		if (outif != NULL && *outif != NULL)
			ifscope = (*outif)->if_index;
		else if (inp->inp_flags & INP_BOUND_IF)
			ifscope = inp->inp_boundifp->if_index;

		/* Caller may forbid routing over cellular interfaces. */
		nocell = (inp->inp_flags & INP_NO_IFT_CELLULAR) ? 1 : 0;
		/*
		 * If route is known or can be allocated now,
		 * our src addr is taken from the i/f, else punt.
		 * Note that we should check the address family of the cached
		 * destination, in case of sharing the cache with IPv6.
		 */
		ro = &inp->inp_route;
		if (ro->ro_rt != NULL)
			RT_LOCK_SPIN(ro->ro_rt);
		/* Discard a cached route that no longer matches. */
		if (ro->ro_rt && (ro->ro_dst.sa_family != AF_INET ||
		    satosin(&ro->ro_dst)->sin_addr.s_addr !=
		    sin->sin_addr.s_addr ||
		    inp->inp_socket->so_options & SO_DONTROUTE ||
		    ro->ro_rt->generation_id != route_generation)) {
			RT_UNLOCK(ro->ro_rt);
			rtfree(ro->ro_rt);
			ro->ro_rt = NULL;
		}
		if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
		    (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
			if (ro->ro_rt != NULL)
				RT_UNLOCK(ro->ro_rt);
			/* No route yet, so try to acquire one */
			bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
			ro->ro_dst.sa_family = AF_INET;
			ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
			((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr =
			    sin->sin_addr;
			rtalloc_scoped(ro, ifscope);
			if (ro->ro_rt != NULL)
				RT_LOCK_SPIN(ro->ro_rt);
		}
		/*
		 * If the route points to a cellular interface and the
		 * caller forbids our using interfaces of such type,
		 * pretend that there is no route.
		 */
		if (nocell && ro->ro_rt != NULL) {
			RT_LOCK_ASSERT_HELD(ro->ro_rt);
			if (ro->ro_rt->rt_ifp->if_type == IFT_CELLULAR) {
				RT_UNLOCK(ro->ro_rt);
				rtfree(ro->ro_rt);
				ro->ro_rt = NULL;
				soevent(inp->inp_socket,
				    (SO_FILT_HINT_LOCKED |
				    SO_FILT_HINT_IFDENIED));
			}
		}
		/*
		 * If we found a route, use the address
		 * corresponding to the outgoing interface
		 * unless it is the loopback (in case a route
		 * to our address on another net goes to loopback).
		 */
		if (ro->ro_rt != NULL) {
			/* Become a regular mutex */
			RT_CONVERT_LOCK(ro->ro_rt);
			if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
				ia = ifatoia(ro->ro_rt->rt_ifa);
				if (ia) {
					IFA_ADDREF(&ia->ia_ifa);
				}
			}
			RT_UNLOCK(ro->ro_rt);
		}
		if (ia == 0) {
			/*
			 * No usable route: fall back to a direct
			 * interface lookup, then to the first
			 * configured address.  The port is zeroed
			 * around the lookups and restored after.
			 */
			u_short fport = sin->sin_port;

			sin->sin_port = 0;
			ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
			if (ia == 0) {
				ia = ifatoia(ifa_ifwithnet_scoped(sintosa(sin),
				    ifscope));
			}
			sin->sin_port = fport;
			if (ia == 0) {
				lck_rw_lock_shared(in_ifaddr_rwlock);
				ia = TAILQ_FIRST(&in_ifaddrhead);
				if (ia)
					IFA_ADDREF(&ia->ia_ifa);
				lck_rw_done(in_ifaddr_rwlock);
			}
			/*
			 * If the source address belongs to a cellular interface
			 * and the socket forbids our using interfaces of such
			 * type, pretend that there is no source address.
			 */
			if (nocell && ia != NULL &&
			    ia->ia_ifa.ifa_ifp->if_type == IFT_CELLULAR) {
				IFA_REMREF(&ia->ia_ifa);
				ia = NULL;
				soevent(inp->inp_socket,
				    (SO_FILT_HINT_LOCKED |
				    SO_FILT_HINT_IFDENIED));
			}
			if (ia == 0)
				return (EADDRNOTAVAIL);
		}
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, use the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			IMO_LOCK(imo);
			if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
			    ia->ia_ifp != imo->imo_multicast_ifp)) {
				ifp = imo->imo_multicast_ifp;
				if (ia)
					IFA_REMREF(&ia->ia_ifa);
				/* Find an address on the multicast ifp. */
				lck_rw_lock_shared(in_ifaddr_rwlock);
				TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp)
						break;
				}
				if (ia)
					IFA_ADDREF(&ia->ia_ifa);
				lck_rw_done(in_ifaddr_rwlock);
				if (ia == 0) {
					IMO_UNLOCK(imo);
					return (EADDRNOTAVAIL);
				}
			}
			IMO_UNLOCK(imo);
		}
		/*
		 * Don't do pcblookup call here; return interface in plocal_sin
		 * and exit to caller, that will do the lookup.
		 */
		IFA_LOCK_SPIN(&ia->ia_ifa);
		*plocal_sin = ia->ia_addr;
		if (outif != NULL)
			*outif = ia->ia_ifp;
		IFA_UNLOCK(&ia->ia_ifa);
		IFA_REMREF(&ia->ia_ifa);
	}
	return(0);
}
887
888 /*
889 * Outer subroutine:
890 * Connect from a socket to a specified address.
891 * Both address and port must be specified in argument sin.
892 * If don't have a local address for this socket yet,
893 * then pick one.
894 */
/*
 * Outer subroutine:
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
    struct ifnet **outif)
{
	struct sockaddr_in ifaddr;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)nam;
	struct inpcb *pcb;
	int error;

	/*
	 * Call inner routine, to assign local interface address.
	 */
	if ((error = in_pcbladdr(inp, nam, &ifaddr, outif)) != 0)
		return(error);

	/*
	 * Look for an existing PCB with the same 4-tuple; the socket
	 * lock is dropped around the hash lookup.
	 */
	socket_unlock(inp->inp_socket, 0);
	pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
	    inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr.sin_addr,
	    inp->inp_lport, 0, NULL);
	socket_lock(inp->inp_socket, 0);

	/* Check if the socket is still in a valid state. When we unlock this
	 * embryonic socket, it can get aborted if another thread is closing
	 * the listener (radar 7947600).
	 */
	if ((inp->inp_socket->so_flags & SOF_ABORTED) != 0) {
		return ECONNREFUSED;
	}

	if (pcb != NULL) {
		/* 4-tuple already in use (possibly by ourselves). */
		in_pcb_checkstate(pcb, WNT_RELEASE, pcb == inp ? 1 : 0);
		return (EADDRINUSE);
	}
	if (inp->inp_laddr.s_addr == INADDR_ANY) {
		if (inp->inp_lport == 0) {
			/* No local port yet: grab an ephemeral one. */
			error = in_pcbbind(inp, (struct sockaddr *)0, p);
			if (error)
				return (error);
		}
		if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
			/* lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
		}
		/* Commit the local address chosen by in_pcbladdr(). */
		inp->inp_laddr = ifaddr.sin_addr;
		inp->inp_last_outifp = (outif != NULL) ? *outif : NULL;
		inp->inp_flags |= INP_INADDR_ANY;
	}
	else {
		if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
			/* lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
		}
	}
	/* Record the foreign endpoint and move to the new hash bucket. */
	inp->inp_faddr = sin->sin_addr;
	inp->inp_fport = sin->sin_port;
	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);
	return (0);
}
958
/*
 * Disconnect a PCB: clear the foreign endpoint, rehash it into the
 * wildcard bucket, and detach if the socket no longer holds a file
 * descriptor reference.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;

	if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
		/* lock inversion issue, mostly with udp multicast packets */
		socket_unlock(inp->inp_socket, 0);
		lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
		socket_lock(inp->inp_socket, 0);
	}

	in_pcbrehash(inp);
	lck_rw_done(inp->inp_pcbinfo->mtx);

	/* Socket already closed its descriptor: finish the teardown. */
	if (inp->inp_socket->so_state & SS_NOFDREF)
		in_pcbdetach(inp);
}
979
/*
 * Detach a PCB from its socket: mark it STOPUSING/DEAD and release
 * the resources it holds (IP options, cached route, multicast
 * options).  Panics if called twice on the same PCB.
 */
void
in_pcbdetach(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	if (so->so_pcb == 0) { /* we've been called twice */
		panic("in_pcbdetach: inp=%p so=%p proto=%d so_pcb is null!\n",
			inp, so, so->so_proto->pr_protocol);
	}

#if IPSEC
	if (ipsec_bypass == 0) {
		ipsec4_delete_pcbpolicy(inp);
	}
#endif /*IPSEC*/

	/* mark socket state as dead */
	if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING)
		panic("in_pcbdetach so=%p prot=%x couldn't set to STOPUSING\n", so, so->so_proto->pr_protocol);

#if TEMPDEBUG
	if (so->cached_in_sock_layer)
		printf("in_pcbdetach for cached socket %x flags=%x\n", so, so->so_flags);
	else
		printf("in_pcbdetach for allocated socket %x flags=%x\n", so, so->so_flags);
#endif
	if ((so->so_flags & SOF_PCBCLEARING) == 0) {
		struct rtentry *rt;
		struct ip_moptions *imo;

		inp->inp_vflag = 0;
		/* Free stored IP options, cached route, multicast state. */
		if (inp->inp_options)
			(void)m_free(inp->inp_options);
		if ((rt = inp->inp_route.ro_rt) != NULL) {
			inp->inp_route.ro_rt = NULL;
			rtfree(rt);
		}
		imo = inp->inp_moptions;
		inp->inp_moptions = NULL;
		if (imo != NULL)
			IMO_REMREF(imo);
		sofreelastref(so, 0);
		inp->inp_state = INPCB_STATE_DEAD;
		so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
	}
}
1026
1027
/*
 * Final teardown of a PCB: unhook it from the pcbinfo lists and free
 * both the PCB and its socket.  The pcbinfo lock must be held
 * exclusively (asserted), and the socket's usecount must have reached
 * zero (panics otherwise).
 */
void
in_pcbdispose(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *ipi = inp->inp_pcbinfo;

#if TEMPDEBUG
	if (inp->inp_state != INPCB_STATE_DEAD) {
		printf("in_pcbdispose: not dead yet? so=%p\n", so);
	}
#endif
	if (so && so->so_usecount != 0)
		panic("%s: so %p so_usecount %d so_lockhistory %s\n",
			__func__, so, so->so_usecount,
			(so != NULL) ? solockhistory_nr(so) : "--");

	lck_rw_assert(ipi->mtx, LCK_RW_ASSERT_EXCLUSIVE);

	inp->inp_gencnt = ++ipi->ipi_gencnt;
	/* access ipi in in_pcbremlists */
	in_pcbremlists(inp);

	if (so) {
		if (so->so_proto->pr_flags & PR_PCBLOCK) {
			sofreelastref(so, 0);
			/* flush anything still queued in the socket buffers */
			if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
#if TEMPDEBUG
				printf("in_pcbdispose sb not cleaned up so=%p rc_cci=%x snd_cc=%x\n",
					so, so->so_rcv.sb_cc, so->so_snd.sb_cc);
#endif
				sbrelease(&so->so_rcv);
				sbrelease(&so->so_snd);
			}
			if (so->so_head != NULL)
				panic("in_pcbdispose, so=%p head still exist\n", so);
			/* release and destroy the per-PCB mutex before freeing */
			lck_mtx_unlock(&inp->inpcb_mtx);
			lck_mtx_destroy(&inp->inpcb_mtx, ipi->mtx_grp);
		}
		so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
		so->so_saved_pcb = (caddr_t) inp;
		so->so_pcb = 0;
		inp->inp_socket = 0;
#if CONFIG_MACF_NET
		mac_inpcb_label_destroy(inp);
#endif
		/*
		 * In case there a route cached after a detach (possible
		 * in the tcp case), make sure that it is freed before
		 * we deallocate the structure.
		 */
		if (inp->inp_route.ro_rt != NULL) {
			rtfree(inp->inp_route.ro_rt);
			inp->inp_route.ro_rt = NULL;
		}
		/*
		 * Cached PCBs ride along with the socket (see
		 * so_saved_pcb above); only separately zalloc'ed ones
		 * are returned to the zone here.
		 */
		if (so->cached_in_sock_layer == 0) {
			zfree(ipi->ipi_zone, inp);
		}
		sodealloc(so);
	}
#if TEMPDEBUG
	else
		printf("in_pcbdispose: no socket for inp=%p\n", inp);
#endif
}
1092
1093 /*
1094 * The calling convention of in_setsockaddr() and in_setpeeraddr() was
1095 * modified to match the pru_sockaddr() and pru_peeraddr() entry points
 * in struct pr_usrreqs, so that protocols can just reference them directly
1097 * without the need for a wrapper function. The socket must have a valid
1098 * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
1099 * except through a kernel programming error, so it is acceptable to panic
1100 * (or in this case trap) if the PCB is invalid. (Actually, we don't trap
1101 * because there actually /is/ a programming error somewhere... XXX)
1102 *
1103 * Returns: 0 Success
1104 * ENOBUFS No buffer space available
1105 * ECONNRESET Connection reset
1106 */
1107 int
1108 in_setsockaddr(struct socket *so, struct sockaddr **nam)
1109 {
1110 struct inpcb *inp;
1111 struct sockaddr_in *sin;
1112
1113 /*
1114 * Do the malloc first in case it blocks.
1115 */
1116 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
1117 if (sin == NULL)
1118 return ENOBUFS;
1119 bzero(sin, sizeof *sin);
1120 sin->sin_family = AF_INET;
1121 sin->sin_len = sizeof(*sin);
1122
1123 inp = sotoinpcb(so);
1124 if (!inp) {
1125 FREE(sin, M_SONAME);
1126 return ECONNRESET;
1127 }
1128 sin->sin_port = inp->inp_lport;
1129 sin->sin_addr = inp->inp_laddr;
1130
1131 *nam = (struct sockaddr *)sin;
1132 return 0;
1133 }
1134
1135 int
1136 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
1137 {
1138 struct inpcb *inp;
1139 struct sockaddr_in *sin;
1140
1141 /*
1142 * Do the malloc first in case it blocks.
1143 */
1144 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
1145 if (sin == NULL)
1146 return ENOBUFS;
1147 bzero((caddr_t)sin, sizeof (*sin));
1148 sin->sin_family = AF_INET;
1149 sin->sin_len = sizeof(*sin);
1150
1151 inp = sotoinpcb(so);
1152 if (!inp) {
1153 FREE(sin, M_SONAME);
1154 return ECONNRESET;
1155 }
1156 sin->sin_port = inp->inp_fport;
1157 sin->sin_addr = inp->inp_faddr;
1158
1159 *nam = (struct sockaddr *)sin;
1160 return 0;
1161 }
1162
/*
 * Walk every IPv4 PCB in `pcbinfo' whose foreign address equals
 * `faddr' and invoke `notify' on it with `errno'.  The pcbinfo lock
 * is held shared across the walk; each matching socket is locked
 * around its callback, with a want-count reference (WNT_ACQUIRE /
 * WNT_RELEASE) held so the PCB cannot be reclaimed meanwhile.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
		int errno, void (*notify)(struct inpcb *, int))
{
	struct inpcb *inp;

	lck_rw_lock_shared(pcbinfo->mtx);

	LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL)
			continue;
		/* skip PCBs that are already being torn down */
		if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;
		socket_lock(inp->inp_socket, 1);
		(*notify)(inp, errno);
		(void)in_pcb_checkstate(inp, WNT_RELEASE, 1);
		socket_unlock(inp->inp_socket, 1);
	}
	lck_rw_done(pcbinfo->mtx);
}
1188
/*
 * Check for alternatives when higher level complains
 * about service problems. For now, invalidate cached
 * routing information. If the route was created dynamically
 * (by a redirect), time to try a default gateway again.
 */
void
in_losing(struct inpcb *inp)
{
	struct rtentry *rt;
	struct rt_addrinfo info;

	if ((rt = inp->inp_route.ro_rt) != NULL) {
		struct in_ifaddr *ia;

		bzero((caddr_t)&info, sizeof(info));
		RT_LOCK(rt);
		info.rti_info[RTAX_DST] =
			(struct sockaddr *)&inp->inp_route.ro_dst;
		info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
		info.rti_info[RTAX_NETMASK] = rt_mask(rt);
		/* tell routing socket listeners we are losing this route */
		rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
		if (rt->rt_flags & RTF_DYNAMIC) {
			/*
			 * Prevent another thread from modifying rt_key,
			 * rt_gateway via rt_setgate() after rt_lock is
			 * dropped by marking the route as defunct.
			 */
			rt->rt_flags |= RTF_CONDEMNED;
			RT_UNLOCK(rt);
			(void) rtrequest(RTM_DELETE, rt_key(rt),
				rt->rt_gateway, rt_mask(rt), rt->rt_flags,
				(struct rtentry **)0);
		} else {
			RT_UNLOCK(rt);
		}
		/* if the address is gone keep the old route in the pcb */
		if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
			inp->inp_route.ro_rt = NULL;
			rtfree(rt);
			IFA_REMREF(&ia->ia_ifa);
		}
		/*
		 * A new route can be allocated
		 * the next time output is attempted.
		 */
	}
}
1237
1238 /*
1239 * After a routing change, flush old routing
1240 * and allocate a (hopefully) better one.
1241 */
1242 void
1243 in_rtchange(struct inpcb *inp, __unused int errno)
1244 {
1245 struct rtentry *rt;
1246
1247 if ((rt = inp->inp_route.ro_rt) != NULL) {
1248 struct in_ifaddr *ia;
1249
1250 if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) {
1251 return; /* we can't remove the route now. not sure if still ok to use src */
1252 }
1253 IFA_REMREF(&ia->ia_ifa);
1254 rtfree(rt);
1255 inp->inp_route.ro_rt = NULL;
1256 /*
1257 * A new route can be allocated the next time
1258 * output is attempted.
1259 */
1260 }
1261 }
1262
/*
 * Lookup a PCB based on the local address and port.
 *
 * With `wild_okay' false: search the address hash for an unconnected
 * PCB (wildcard foreign address) exactly matching laddr/lport.
 * With `wild_okay' true: best-fit search over the port hash list,
 * preferring the PCB with the fewest wildcard address components.
 * No reference is taken on the returned PCB.
 * NOTE(review): assumes the caller holds the pcbinfo lock -- confirm
 * at the call sites.
 */
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
		   unsigned int lport_arg, int wild_okay)
{
	struct inpcb *inp;
	int matchwild = 3, wildcard;	/* 3 exceeds the max wildcard count (2) */
	u_short lport = lport_arg;

	KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0,0,0,0,0);

	if (!wild_okay) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
#if INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found.
				 * NOTE(review): this return path skips the
				 * DBG_FUNC_END trace emitted on the other
				 * exits.
				 */
				return (inp);
			}
		}
		/*
		 * Not found.
		 */
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0,0,0,0,0);
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->porthashmask)];
		LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.  Each mismatching-but-compatible address adds
			 * one to `wildcard'; the lowest score wins.
			 */
			LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
#if INET6
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
#endif
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0) {
						/* exact match; can't do better */
						break;
					}
				}
			}
		}
		KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,0,0,0,0);
		return (match);
	}
}
1353
/*
 * Check if PCB exists in hash list.
 *
 * Returns nonzero and fills *uid/*gid from the owning socket's
 * credentials when a PCB matches the given foreign/local address and
 * port tuple; returns 0 otherwise.  When `wildcard' is set,
 * unconnected PCBs bound to INADDR_ANY are considered as fallbacks.
 * *uid/*gid default to UID_MAX/GID_MAX.  PCBs lacking INP_RECV_ANYIF
 * are skipped when `ifp' is receive-restricted.  Takes the pcbinfo
 * lock shared; no reference is left on any PCB.
 */
int
in_pcblookup_hash_exists(
	struct inpcbinfo *pcbinfo,
	struct in_addr faddr,
	u_int fport_arg,
	struct in_addr laddr,
	u_int lport_arg,
	int wildcard,
	uid_t *uid,
	gid_t *gid,
	struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp;
	u_short fport = fport_arg, lport = lport_arg;
	int found;

	*uid = UID_MAX;
	*gid = GID_MAX;

	/*
	 * We may have found the pcb in the last lookup - check this first.
	 */

	lck_rw_lock_shared(pcbinfo->mtx);

	/*
	 * First look for an exact match.
	 */
	head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->hashmask)];
	LIST_FOREACH(inp, head, inp_hash) {
#if INET6
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		/* honor interface receive restrictions */
		if (ip_restrictrecvif && ifp != NULL &&
		    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
		    !(inp->inp_flags & INP_RECV_ANYIF))
			continue;

		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			if ((found = (inp->inp_socket != NULL))) {
				/*
				 * Found.
				 */
				*uid = kauth_cred_getuid(
				    inp->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
				    inp->inp_socket->so_cred);
			}
			lck_rw_done(pcbinfo->mtx);
			return (found);
		}
	}
	if (wildcard) {
		struct inpcb *local_wild = NULL;
#if INET6
		struct inpcb *local_wild_mapped = NULL;
#endif

		head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0,
		    pcbinfo->hashmask)];
		LIST_FOREACH(inp, head, inp_hash) {
#if INET6
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (ip_restrictrecvif && ifp != NULL &&
			    (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
			    !(inp->inp_flags & INP_RECV_ANYIF))
				continue;

			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_lport == lport) {
				/* exact local address beats any wildcard */
				if (inp->inp_laddr.s_addr == laddr.s_addr) {
					if ((found = (inp->inp_socket != NULL))) {
						*uid = kauth_cred_getuid(
						    inp->inp_socket->so_cred);
						*gid = kauth_cred_getgid(
						    inp->inp_socket->so_cred);
					}
					lck_rw_done(pcbinfo->mtx);
					return (found);
				}
				else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#if INET6
					/* v4-mapped v6 sockets rank below plain v4 */
					if (inp->inp_socket &&
					    INP_CHECK_SOCKAF(inp->inp_socket,
					    AF_INET6))
						local_wild_mapped = inp;
					else
#endif /* INET6 */
					local_wild = inp;
				}
			}
		}
		if (local_wild == NULL) {
#if INET6
			if (local_wild_mapped != NULL) {
				if ((found = (local_wild_mapped->inp_socket != NULL))) {
					*uid = kauth_cred_getuid(
					    local_wild_mapped->inp_socket->so_cred);
					*gid = kauth_cred_getgid(
					    local_wild_mapped->inp_socket->so_cred);
				}
				lck_rw_done(pcbinfo->mtx);
				return (found);
			}
#endif /* INET6 */
			lck_rw_done(pcbinfo->mtx);
			return (0);
		}
		if (local_wild != NULL) {
			if ((found = (local_wild->inp_socket != NULL))) {
				*uid = kauth_cred_getuid(
				    local_wild->inp_socket->so_cred);
				*gid = kauth_cred_getgid(
				    local_wild->inp_socket->so_cred);
			}
			lck_rw_done(pcbinfo->mtx);
			return (found);
		}
	}

	/*
	 * Not found.
	 */
	lck_rw_done(pcbinfo->mtx);
	return (0);
}
1491
1492 /*
1493 * Lookup PCB in hash list.
1494 */
1495 struct inpcb *
1496 in_pcblookup_hash(
1497 struct inpcbinfo *pcbinfo,
1498 struct in_addr faddr,
1499 u_int fport_arg,
1500 struct in_addr laddr,
1501 u_int lport_arg,
1502 int wildcard,
1503 struct ifnet *ifp)
1504 {
1505 struct inpcbhead *head;
1506 struct inpcb *inp;
1507 u_short fport = fport_arg, lport = lport_arg;
1508
1509 /*
1510 * We may have found the pcb in the last lookup - check this first.
1511 */
1512
1513 lck_rw_lock_shared(pcbinfo->mtx);
1514
1515 /*
1516 * First look for an exact match.
1517 */
1518 head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
1519 LIST_FOREACH(inp, head, inp_hash) {
1520 #if INET6
1521 if ((inp->inp_vflag & INP_IPV4) == 0)
1522 continue;
1523 #endif
1524 if (ip_restrictrecvif && ifp != NULL &&
1525 (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
1526 !(inp->inp_flags & INP_RECV_ANYIF))
1527 continue;
1528
1529 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1530 inp->inp_laddr.s_addr == laddr.s_addr &&
1531 inp->inp_fport == fport &&
1532 inp->inp_lport == lport) {
1533 /*
1534 * Found.
1535 */
1536 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1537 lck_rw_done(pcbinfo->mtx);
1538 return (inp);
1539 }
1540 else { /* it's there but dead, say it isn't found */
1541 lck_rw_done(pcbinfo->mtx);
1542 return (NULL);
1543 }
1544 }
1545 }
1546 if (wildcard) {
1547 struct inpcb *local_wild = NULL;
1548 #if INET6
1549 struct inpcb *local_wild_mapped = NULL;
1550 #endif
1551
1552 head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1553 LIST_FOREACH(inp, head, inp_hash) {
1554 #if INET6
1555 if ((inp->inp_vflag & INP_IPV4) == 0)
1556 continue;
1557 #endif
1558 if (ip_restrictrecvif && ifp != NULL &&
1559 (ifp->if_eflags & IFEF_RESTRICTED_RECV) &&
1560 !(inp->inp_flags & INP_RECV_ANYIF))
1561 continue;
1562
1563 if (inp->inp_faddr.s_addr == INADDR_ANY &&
1564 inp->inp_lport == lport) {
1565 if (inp->inp_laddr.s_addr == laddr.s_addr) {
1566 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1567 lck_rw_done(pcbinfo->mtx);
1568 return (inp);
1569 }
1570 else { /* it's there but dead, say it isn't found */
1571 lck_rw_done(pcbinfo->mtx);
1572 return (NULL);
1573 }
1574 }
1575 else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1576 #if INET6
1577 if (INP_CHECK_SOCKAF(inp->inp_socket,
1578 AF_INET6))
1579 local_wild_mapped = inp;
1580 else
1581 #endif /* INET6 */
1582 local_wild = inp;
1583 }
1584 }
1585 }
1586 if (local_wild == NULL) {
1587 #if INET6
1588 if (local_wild_mapped != NULL) {
1589 if (in_pcb_checkstate(local_wild_mapped, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1590 lck_rw_done(pcbinfo->mtx);
1591 return (local_wild_mapped);
1592 }
1593 else { /* it's there but dead, say it isn't found */
1594 lck_rw_done(pcbinfo->mtx);
1595 return (NULL);
1596 }
1597 }
1598 #endif /* INET6 */
1599 lck_rw_done(pcbinfo->mtx);
1600 return (NULL);
1601 }
1602 if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1603 lck_rw_done(pcbinfo->mtx);
1604 return (local_wild);
1605 }
1606 else { /* it's there but dead, say it isn't found */
1607 lck_rw_done(pcbinfo->mtx);
1608 return (NULL);
1609 }
1610 }
1611
1612 /*
1613 * Not found.
1614 */
1615 lck_rw_done(pcbinfo->mtx);
1616 return (NULL);
1617 }
1618
/*
 * Insert PCB onto various hash lists.
 *
 * `locked' indicates whether the caller already holds the pcbinfo
 * lock exclusively; otherwise it is taken (and released) here.
 *
 * Returns: 0		Success
 *	    ECONNABORTED The socket was dropped while the socket lock
 *			 was released to avoid lock inversion
 *	    ENOBUFS	Could not allocate a port-hash head
 */
int
in_pcbinshash(struct inpcb *inp, int locked)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	u_int32_t hashkey_faddr;

	if (!locked) {
		if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
			/*lock inversion issue, mostly with udp multicast packets */
			socket_unlock(inp->inp_socket, 0);
			lck_rw_lock_exclusive(pcbinfo->mtx);
			socket_lock(inp->inp_socket, 0);
			if (inp->inp_state == INPCB_STATE_DEAD) {
				/* The socket got dropped when it was unlocked */
				lck_rw_done(pcbinfo->mtx);
				return(ECONNABORTED);
			}
		}
	}

#if INET6
	if (inp->inp_vflag & INP_IPV6)
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	else
#endif /* INET6 */
	hashkey_faddr = inp->inp_faddr.s_addr;

	inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->hashmask);

	pcbhash = &pcbinfo->hashbase[inp->hash_element];

	pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
	    pcbinfo->porthashmask)];

	/*
	 * Go through port list and look for a head for this lport.
	 */
	LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	VERIFY(inp->inp_state != INPCB_STATE_DEAD);

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_WAITOK);
		if (phd == NULL) {
			if (!locked)
				lck_rw_done(pcbinfo->mtx);
			return (ENOBUFS); /* XXX */
		}
		phd->phd_port = inp->inp_lport;
		LIST_INIT(&phd->phd_pcblist);
		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	inp->inp_phd = phd;
	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	if (!locked)
		lck_rw_done(pcbinfo->mtx);
	return (0);
}
1690
/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 *
 * NOTE(review): callers in this file take the pcbinfo lock exclusively
 * before calling (see in_pcbdisconnect()); assumed required here.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

#if INET6
	if (inp->inp_vflag & INP_IPV6)
		hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
	else
#endif /* INET6 */
	hashkey_faddr = inp->inp_faddr.s_addr;
	inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
	    inp->inp_fport, inp->inp_pcbinfo->hashmask);
	head = &inp->inp_pcbinfo->hashbase[inp->hash_element];

	/* unlink from the old bucket, insert into the new one */
	LIST_REMOVE(inp, inp_hash);
	LIST_INSERT_HEAD(head, inp, inp_hash);
}
1716
/*
 * Remove PCB from the various hash and global lists, and drop any
 * flow control entry keyed by its flow hash.
 * Must be called with the pcbinfo lock held in exclusive mode.
 */
void
in_pcbremlists(struct inpcb *inp)
{
	struct inp_fc_entry *infce;
	inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;

	if (inp->inp_lport) {
		struct inpcbport *phd = inp->inp_phd;

		LIST_REMOVE(inp, inp_hash);
		LIST_REMOVE(inp, inp_portlist);
		/* free the port-hash head if this was its last PCB */
		if (phd != NULL && (LIST_FIRST(&phd->phd_pcblist) == NULL)) {
			LIST_REMOVE(phd, phd_hash);
			FREE(phd, M_PCB);
		}
	}
	LIST_REMOVE(inp, inp_list);

	/* discard any pending flow control entry for this PCB */
	infce = inp_fc_getinp(inp->inp_flowhash);
	if (infce != NULL)
		inp_fc_entry_free(infce);

	inp->inp_pcbinfo->ipi_count--;
}
1745
/* Mechanism used to defer the memory release of PCBs
 * The pcb list will contain the pcb until the ripper can clean it up if
 * the following conditions are met: 1) state "DEAD", 2) wantcnt is STOPUSING
 * 3) usecount is zero.
 * This function is called to mark the pcb as unusable (WNT_STOPUSING),
 * to take (WNT_ACQUIRE) or drop (WNT_RELEASE) a want-count reference.
 * `locked' indicates whether the caller already holds the socket lock.
 * The want count lives in the low 16 bits of inp_wantcnt; the value
 * 0xffff is the STOPUSING sentinel.
 */
int
in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
{

	volatile UInt32 *wantcnt = (volatile UInt32 *)&pcb->inp_wantcnt;
	UInt32 origwant;
	UInt32 newwant;

	switch (mode) {

	case WNT_STOPUSING:	/* try to mark the pcb as ready for recycling */

		/* compareswap with STOPUSING, if success we're good, if it's in use, will be marked later */

		if (locked == 0)
			socket_lock(pcb->inp_socket, 1);
		pcb->inp_state = INPCB_STATE_DEAD;

stopusing:
		if (pcb->inp_socket->so_usecount < 0)
			panic("in_pcb_checkstate STOP pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);
		if (locked == 0)
			socket_unlock(pcb->inp_socket, 1);

		origwant = *wantcnt;
		if ((UInt16) origwant == 0xffff)	/* should stop using */
			return (WNT_STOPUSING);
		newwant = 0xffff;
		if ((UInt16) origwant == 0) {	/* try to mark it as unusable now */
			OSCompareAndSwap(origwant, newwant, wantcnt) ;
		}
		return (WNT_STOPUSING);
		break;

	case WNT_ACQUIRE:	/* try to increase reference to pcb */
		/* if WNT_STOPUSING should bail out */
		/*
		 * if socket state DEAD, try to set count to STOPUSING, return failed
		 * otherwise increase cnt
		 */
		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0xffff) {	/* should stop using */
				// printf("in_pcb_checkstate: ACQ PCB was STOPUSING while release. odd pcb=%p\n", pcb);
				return (WNT_STOPUSING);
			}
			newwant = origwant + 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));
		return (WNT_ACQUIRE);
		break;

	case WNT_RELEASE:	/* release reference. if result is null and pcb state is DEAD,
				   set wanted bit to STOPUSING
				 */

		if (locked == 0)
			socket_lock(pcb->inp_socket, 1);

		do {
			origwant = *wantcnt;
			if ((UInt16) origwant == 0x0)
				panic("in_pcb_checkstate pcb=%p release with zero count", pcb);
			if ((UInt16) origwant == 0xffff) {	/* should stop using */
#if TEMPDEBUG
				printf("in_pcb_checkstate: REL PCB was STOPUSING while release. odd pcb=%p\n", pcb);
#endif
				if (locked == 0)
					socket_unlock(pcb->inp_socket, 1);
				return (WNT_STOPUSING);
			}
			newwant = origwant - 1;
		} while (!OSCompareAndSwap(origwant, newwant, wantcnt));

		/* last release of a DEAD pcb: try to latch STOPUSING */
		if (pcb->inp_state == INPCB_STATE_DEAD)
			goto stopusing;
		if (pcb->inp_socket->so_usecount < 0)
			panic("in_pcb_checkstate RELEASE pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);

		if (locked == 0)
			socket_unlock(pcb->inp_socket, 1);
		return (WNT_RELEASE);
		break;

	default:

		panic("in_pcb_checkstate: so=%p not a valid state =%x\n", pcb->inp_socket, mode);
	}

	/* NOTREACHED */
	return (mode);
}
1843
1844 /*
1845 * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
1846 * The inpcb_compat data structure is passed to user space and must
1847 * not change. We intentionally avoid copying pointers.
1848 */
1849 void
1850 inpcb_to_compat(
1851 struct inpcb *inp,
1852 struct inpcb_compat *inp_compat)
1853 {
1854 bzero(inp_compat, sizeof(*inp_compat));
1855 inp_compat->inp_fport = inp->inp_fport;
1856 inp_compat->inp_lport = inp->inp_lport;
1857 inp_compat->nat_owner = 0;
1858 inp_compat->nat_cookie = inp->nat_cookie;
1859 inp_compat->inp_gencnt = inp->inp_gencnt;
1860 inp_compat->inp_flags = inp->inp_flags;
1861 inp_compat->inp_flow = inp->inp_flow;
1862 inp_compat->inp_vflag = inp->inp_vflag;
1863 inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
1864 inp_compat->inp_ip_p = inp->inp_ip_p;
1865 inp_compat->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
1866 inp_compat->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
1867 inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
1868 inp_compat->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
1869 inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
1870 inp_compat->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
1871 inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
1872 }
1873
1874 #if !CONFIG_EMBEDDED
1875
1876 void
1877 inpcb_to_xinpcb64(
1878 struct inpcb *inp,
1879 struct xinpcb64 *xinp)
1880 {
1881 xinp->inp_fport = inp->inp_fport;
1882 xinp->inp_lport = inp->inp_lport;
1883 xinp->inp_gencnt = inp->inp_gencnt;
1884 xinp->inp_flags = inp->inp_flags;
1885 xinp->inp_flow = inp->inp_flow;
1886 xinp->inp_vflag = inp->inp_vflag;
1887 xinp->inp_ip_ttl = inp->inp_ip_ttl;
1888 xinp->inp_ip_p = inp->inp_ip_p;
1889 xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
1890 xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
1891 xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
1892 xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
1893 xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
1894 xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
1895 xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
1896 }
1897
1898 #endif /* !CONFIG_EMBEDDED */
1899
1900
1901 /*
1902 * The following routines implement this scheme:
1903 *
1904 * Callers of ip_output() that intend to cache the route in the inpcb pass
1905 * a local copy of the struct route to ip_output(). Using a local copy of
1906 * the cached route significantly simplifies things as IP no longer has to
1907 * worry about having exclusive access to the passed in struct route, since
1908 * it's defined in the caller's stack; in essence, this allows for a lock-
1909 * less operation when updating the struct route at the IP level and below,
1910 * whenever necessary. The scheme works as follows:
1911 *
1912 * Prior to dropping the socket's lock and calling ip_output(), the caller
1913 * copies the struct route from the inpcb into its stack, and adds a reference
1914 * to the cached route entry, if there was any. The socket's lock is then
1915 * dropped and ip_output() is called with a pointer to the copy of struct
1916 * route defined on the stack (not to the one in the inpcb.)
1917 *
1918 * Upon returning from ip_output(), the caller then acquires the socket's
1919 * lock and synchronizes the cache; if there is no route cached in the inpcb,
1920 * it copies the local copy of struct route (which may or may not contain any
1921 * route) back into the cache; otherwise, if the inpcb has a route cached in
1922 * it, the one in the local copy will be freed, if there's any. Trashing the
1923 * cached route in the inpcb can be avoided because ip_output() is single-
1924 * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
1925 * by the socket/transport layer.)
1926 */
/*
 * Copy the PCB's cached route into `dst', a struct route on the
 * caller's stack, taking a reference on any cached rtentry (see the
 * scheme description above).  A cached route that is not IPv4 (the
 * IPv4-mapped address case) is freed first.  Requires the per-PCB
 * mutex to be held (asserted).
 */
void
inp_route_copyout(struct inpcb *inp, struct route *dst)
{
	struct route *src = &inp->inp_route;

	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	/*
	 * If the route in the PCB is not for IPv4, blow it away;
	 * this is possible in the case of IPv4-mapped address case.
	 */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
		rtfree(src->ro_rt);
		src->ro_rt = NULL;
	}

	route_copyout(dst, src, sizeof(*dst));
}
1945
/*
 * Synchronize the PCB's route cache with `src', the caller-stack copy
 * previously obtained via inp_route_copyout() (see the scheme
 * description above).  Requires the per-PCB mutex to be held
 * (asserted).
 */
void
inp_route_copyin(struct inpcb *inp, struct route *src)
{
	struct route *dst = &inp->inp_route;

	lck_mtx_assert(&inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);

	/* Minor sanity check: only an IPv4 route may be handed back */
	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET)
		panic("%s: wrong or corrupted route: %p", __func__, src);

	route_copyin(src, dst, sizeof(*src));
}
1959
/*
 * Handler for setting IP_FORCE_OUT_IFP/IP_BOUND_IF/IPV6_BOUND_IF socket option.
 *
 * Returns 0 on success, or ENXIO if `ifscope' is neither IFSCOPE_NONE
 * nor the index of an attached interface.
 */
int
inp_bindif(struct inpcb *inp, unsigned int ifscope)
{
	struct ifnet *ifp = NULL;

	/*
	 * Validate the scope under the ifnet head lock, since
	 * ifindex2ifnet may otherwise change underneath us.
	 */
	ifnet_head_lock_shared();
	if ((ifscope > (unsigned)if_index) || (ifscope != IFSCOPE_NONE &&
	    (ifp = ifindex2ifnet[ifscope]) == NULL)) {
		ifnet_head_done();
		return (ENXIO);
	}
	ifnet_head_done();

	VERIFY(ifp != NULL || ifscope == IFSCOPE_NONE);

	/*
	 * A zero interface scope value indicates an "unbind".
	 * Otherwise, take in whatever value the app desires;
	 * the app may already know the scope (or force itself
	 * to such a scope) ahead of time before the interface
	 * gets attached. It doesn't matter either way; any
	 * route lookup from this point on will require an
	 * exact match for the embedded interface scope.
	 */
	inp->inp_boundifp = ifp;
	if (inp->inp_boundifp == NULL)
		inp->inp_flags &= ~INP_BOUND_IF;
	else
		inp->inp_flags |= INP_BOUND_IF;

	/* Blow away any cached route in the PCB */
	if (inp->inp_route.ro_rt != NULL) {
		rtfree(inp->inp_route.ro_rt);
		inp->inp_route.ro_rt = NULL;
	}

	return (0);
}
2001
2002 /*
2003 * Handler for setting IP_NO_IFT_CELLULAR/IPV6_NO_IFT_CELLULAR socket option.
2004 */
2005 int
2006 inp_nocellular(struct inpcb *inp, unsigned int val)
2007 {
2008 if (val) {
2009 inp->inp_flags |= INP_NO_IFT_CELLULAR;
2010 } else if (inp->inp_flags & INP_NO_IFT_CELLULAR) {
2011 /* once set, it cannot be unset */
2012 return (EINVAL);
2013 }
2014
2015 /* Blow away any cached route in the PCB */
2016 if (inp->inp_route.ro_rt != NULL) {
2017 rtfree(inp->inp_route.ro_rt);
2018 inp->inp_route.ro_rt = NULL;
2019 }
2020
2021 return (0);
2022 }
2023
2024 /*
2025 * Calculate flow hash for an inp, used by an interface to identify a
2026 * flow. When an interface provides flow control advisory, this flow
2027 * hash is used as an identifier.
2028 */
2029 u_int32_t
2030 inp_calc_flowhash(struct inpcb *inp)
2031 {
2032 struct inp_flowhash_key fh __attribute__((aligned(8)));
2033 u_int32_t flowhash = 0;
2034
2035 if (inp_hash_seed == 0)
2036 inp_hash_seed = RandomULong();
2037
2038 bzero(&fh, sizeof (fh));
2039
2040 bcopy(&inp->inp_dependladdr, &fh.infh_laddr, sizeof (fh.infh_laddr));
2041 bcopy(&inp->inp_dependfaddr, &fh.infh_faddr, sizeof (fh.infh_faddr));
2042
2043 fh.infh_lport = inp->inp_lport;
2044 fh.infh_fport = inp->inp_fport;
2045 fh.infh_af = (inp->inp_vflag & INP_IPV6) ? AF_INET6 : AF_INET;
2046 fh.infh_proto = inp->inp_ip_p;
2047 fh.infh_rand1 = RandomULong();
2048 fh.infh_rand2 = RandomULong();
2049
2050 try_again:
2051 flowhash = net_flowhash(&fh, sizeof (fh), inp_hash_seed);
2052 if (flowhash == 0) {
2053 /* try to get a non-zero flowhash */
2054 inp_hash_seed = RandomULong();
2055 goto try_again;
2056 }
2057
2058 return flowhash;
2059 }
2060
2061 /*
2062 * Function to compare inp_fc_entries in inp flow control tree
2063 */
2064 static inline int
2065 infc_cmp(const struct inp_fc_entry *fc1, const struct inp_fc_entry *fc2)
2066 {
2067 return (fc1->infc_flowhash - fc2->infc_flowhash);
2068 }
2069
/*
 * Insert `inp' into the global flow control tree, keyed by its flow
 * hash.  Returns 1 if the entry was added (or was already present for
 * this PCB); returns 0 on a flow-hash collision with a different PCB
 * or on allocation failure.
 */
int
inp_fc_addinp(struct inpcb *inp)
{
	struct inp_fc_entry keyfc, *infc;
	u_int32_t flowhash = inp->inp_flowhash;

	keyfc.infc_flowhash = flowhash;

	lck_mtx_lock_spin(&inp_fc_lck);
	infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc);
	if (infc != NULL && infc->infc_inp == inp) {
		/* Entry is already in inp_fc_tree, return */
		lck_mtx_unlock(&inp_fc_lck);
		return (1);
	}

	if (infc != NULL) {
		/*
		 * There is a different fc entry with the same
		 * flow hash but different inp pointer. There
		 * can be a collision on flow hash but the
		 * probability is low. Let's just avoid
		 * adding a second one when there is a collision
		 */
		lck_mtx_unlock(&inp_fc_lck);
		return (0);
	}

	/* become regular mutex */
	lck_mtx_convert_spin(&inp_fc_lck);

	/* non-blocking allocation; may fail under memory pressure */
	infc = zalloc_noblock(inp_fcezone);
	if (infc == NULL) {
		/* memory allocation failed */
		lck_mtx_unlock(&inp_fc_lck);
		return (0);
	}
	bzero(infc, sizeof (*infc));

	infc->infc_flowhash = flowhash;
	infc->infc_inp = inp;

	RB_INSERT(inp_fc_tree, &inp_fc_tree, infc);
	lck_mtx_unlock(&inp_fc_lck);
	return (1);
}
2116
/*
 * Look up, and remove from the flow control tree, the entry matching
 * the given flow hash.  On success a want-count (WNT_ACQUIRE) has been
 * taken on the entry's inp.  Returns NULL if no entry exists for the
 * hash, or if the inp is already being torn down (WNT_STOPUSING), in
 * which case the entry is freed here.
 */
struct inp_fc_entry*
inp_fc_getinp(u_int32_t flowhash)
{
	struct inp_fc_entry keyfc, *infc;

	/* Only the key field needs to be set for the RB_FIND lookup. */
	keyfc.infc_flowhash = flowhash;

	lck_mtx_lock_spin(&inp_fc_lck);
	infc = RB_FIND(inp_fc_tree, &inp_fc_tree, &keyfc);
	if (infc == NULL) {
		/* inp is not present, return */
		lck_mtx_unlock(&inp_fc_lck);
		return (NULL);
	}

	/* The entry is handed to the caller; take it out of the tree. */
	RB_REMOVE(inp_fc_tree, &inp_fc_tree, infc);

	if (in_pcb_checkstate(infc->infc_inp, WNT_ACQUIRE, 0) ==
	    WNT_STOPUSING) {
		/* become regular mutex */
		lck_mtx_convert_spin(&inp_fc_lck);

		/*
		 * This inp is going away, just don't process it.
		 */
		inp_fc_entry_free(infc);
		infc = NULL;
	}
	lck_mtx_unlock(&inp_fc_lck);

	return (infc);
}
2149
/*
 * Return a flow control tree entry to its zone.
 */
void
inp_fc_entry_free(struct inp_fc_entry *infc)
{
	zfree(inp_fcezone, infc);
}
2155
/*
 * Process flow control feedback from the interface for this inp:
 * release the want-count held on it, clear its flow-controlled or
 * suspended state, and undo TCP throttling for stream sockets.
 */
void
inp_fc_feedback(struct inpcb *inp)
{
	struct socket *so = inp->inp_socket;

	/* we already hold a want_cnt on this inp, socket can't be null */
	VERIFY (so != NULL);
	socket_lock(so, 1);

	/* Drop the want-count; bail if the inp is being torn down. */
	if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
		socket_unlock(so, 1);
		return;
	}

	/*
	 * Return if the connection is not in flow-controlled state.
	 * This can happen if the connection experienced
	 * loss while it was in flow controlled state
	 */
	if (!INP_WAIT_FOR_IF_FEEDBACK(inp)) {
		socket_unlock(so, 1);
		return;
	}
	inp_reset_fc_state(inp);

	if (so->so_proto->pr_type == SOCK_STREAM)
		inp_fc_unthrottle_tcp(inp);

	socket_unlock(so, 1);
}
2186
2187 void
2188 inp_reset_fc_state(struct inpcb *inp)
2189 {
2190 struct socket *so = inp->inp_socket;
2191 int suspended = (INP_IS_FLOW_SUSPENDED(inp)) ? 1 : 0;
2192 int needwakeup = (INP_WAIT_FOR_IF_FEEDBACK(inp)) ? 1 : 0;
2193
2194 inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
2195
2196 if (suspended) {
2197 so->so_flags &= ~(SOF_SUSPENDED);
2198 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_RESUME));
2199 }
2200
2201 if (inp->inp_sndinprog_cnt > 0)
2202 inp->inp_flags |= INP_FC_FEEDBACK;
2203
2204 /* Give a write wakeup to unblock the socket */
2205 if (needwakeup)
2206 sowwakeup(so);
2207 }
2208
2209 int
2210 inp_set_fc_state(struct inpcb *inp, int advcode)
2211 {
2212 /*
2213 * If there was a feedback from the interface when
2214 * send operation was in progress, we should ignore
2215 * this flow advisory to avoid a race between setting
2216 * flow controlled state and receiving feedback from
2217 * the interface
2218 */
2219 if (inp->inp_flags & INP_FC_FEEDBACK)
2220 return(0);
2221
2222 inp->inp_flags &= ~(INP_FLOW_CONTROLLED | INP_FLOW_SUSPENDED);
2223 if (inp_fc_addinp(inp)) {
2224 switch (advcode) {
2225 case FADV_FLOW_CONTROLLED:
2226 inp->inp_flags |= INP_FLOW_CONTROLLED;
2227 break;
2228 case FADV_SUSPENDED:
2229 inp->inp_flags |= INP_FLOW_SUSPENDED;
2230 soevent(inp->inp_socket,
2231 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_SUSPEND));
2232
2233 /* Record the fact that suspend event was sent */
2234 inp->inp_socket->so_flags |= SOF_SUSPENDED;
2235 break;
2236 }
2237 }
2238 return(1);
2239 }
2240
2241 /*
2242 * Handler for SO_FLUSH socket option.
2243 */
2244 int
2245 inp_flush(struct inpcb *inp, int optval)
2246 {
2247 u_int32_t flowhash = inp->inp_flowhash;
2248 struct rtentry *rt;
2249
2250 /* Either all classes or one of the valid ones */
2251 if (optval != SO_TC_ALL && !SO_VALID_TC(optval))
2252 return (EINVAL);
2253
2254 /* We need a flow hash for identification */
2255 if (flowhash == 0)
2256 return (0);
2257
2258 /* We need a cached route for the interface */
2259 if ((rt = inp->inp_route.ro_rt) != NULL) {
2260 struct ifnet *ifp = rt->rt_ifp;
2261 if_qflush_sc(ifp, so_tc2msc(optval), flowhash, NULL, NULL, 0);
2262 }
2263
2264 return (0);
2265 }
2266
2267 /*
2268 * Clear the INP_INADDR_ANY flag (special case for PPP only)
2269 */
2270 void inp_clear_INP_INADDR_ANY(struct socket *so)
2271 {
2272 struct inpcb *inp = NULL;
2273
2274 socket_lock(so, 1);
2275 inp = sotoinpcb(so);
2276 if (inp) {
2277 inp->inp_flags &= ~INP_INADDR_ANY;
2278 }
2279 socket_unlock(so, 1);
2280 }
2281