]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/in_pcb.c
10efe37f0f35a21a4e241e5df73cacf827ec86ba
[apple/xnu.git] / bsd / netinet / in_pcb.c
1 /*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * Copyright (c) 1982, 1986, 1991, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/in_pcb.c,v 1.59.2.17 2001/08/13 16:26:17 ume Exp $
62 */
63
64 #include <sys/param.h>
65 #include <sys/systm.h>
66 #include <sys/malloc.h>
67 #include <sys/mbuf.h>
68 #include <sys/domain.h>
69 #include <sys/protosw.h>
70 #include <sys/socket.h>
71 #include <sys/socketvar.h>
72 #include <sys/proc.h>
73 #ifndef __APPLE__
74 #include <sys/jail.h>
75 #endif
76 #include <sys/kernel.h>
77 #include <sys/sysctl.h>
78 #include <libkern/OSAtomic.h>
79
80 #include <machine/limits.h>
81
82 #ifdef __APPLE__
83 #include <kern/zalloc.h>
84 #endif
85
86 #include <net/if.h>
87 #include <net/if_types.h>
88 #include <net/route.h>
89
90 #include <netinet/in.h>
91 #include <netinet/in_pcb.h>
92 #include <netinet/in_var.h>
93 #include <netinet/ip_var.h>
94 #if INET6
95 #include <netinet/ip6.h>
96 #include <netinet6/ip6_var.h>
97 #endif /* INET6 */
98
99 #include "faith.h"
100
101 #if IPSEC
102 #include <netinet6/ipsec.h>
103 #include <netkey/key.h>
104 #endif /* IPSEC */
105
106 #include <sys/kdebug.h>
107 #include <sys/random.h>
108
109 #if IPSEC
110 extern int ipsec_bypass;
111 #endif
112
113 #define DBG_FNC_PCB_LOOKUP NETDBG_CODE(DBG_NETTCP, (6 << 8))
114 #define DBG_FNC_PCB_HLOOKUP NETDBG_CODE(DBG_NETTCP, ((6 << 8) | 1))
115
116 struct in_addr zeroin_addr;
117
118 /*
119 * These configure the range of local port addresses assigned to
120 * "unspecified" outgoing connections/packets/whatever.
121 */
122 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
123 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
124 #ifndef __APPLE__
125 int ipport_firstauto = IPPORT_RESERVED; /* 1024 */
126 int ipport_lastauto = IPPORT_USERRESERVED; /* 5000 */
127 #else
128 int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
129 int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */
130 #endif
131 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
132 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
133
/*
 * Clamp the lvalue `var' into the inclusive range [min, max].
 *
 * The body is wrapped in do { ... } while (0) so that the expansion is
 * exactly one statement: "RANGECHK(x, lo, hi);" can then be used as the
 * body of an unbraced if/else without capturing the caller's else.
 * Note that `var' is evaluated more than once; pass a plain variable.
 */
#define	RANGECHK(var, min, max)						\
	do {								\
		if ((var) < (min)) {					\
			(var) = (min);					\
		} else if ((var) > (max)) {				\
			(var) = (max);					\
		}							\
	} while (0)
137
138 static int
139 sysctl_net_ipport_check SYSCTL_HANDLER_ARGS
140 {
141 #pragma unused(arg1, arg2)
142 int error = sysctl_handle_int(oidp,
143 oidp->oid_arg1, oidp->oid_arg2, req);
144 if (!error) {
145 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
146 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
147 RANGECHK(ipport_firstauto, IPPORT_RESERVED, USHRT_MAX);
148 RANGECHK(ipport_lastauto, IPPORT_RESERVED, USHRT_MAX);
149 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, USHRT_MAX);
150 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, USHRT_MAX);
151 }
152 return error;
153 }
154
155 #undef RANGECHK
156
157 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IP Ports");
158
159 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
160 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
161 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
162 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
163 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
164 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
165 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
166 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
167 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
168 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
169 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
170 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
171
172 extern int udp_use_randomport;
173 extern int tcp_use_randomport;
174
175 /*
176 * in_pcb.c: manage the Protocol Control Blocks.
177 *
178 * NOTE: It is assumed that most of these functions will be called at
179 * splnet(). XXX - There are, unfortunately, a few exceptions to this
180 * rule that should be fixed.
181 */
182
183 /*
184 * Allocate a PCB and associate it with the socket.
185 *
186 * Returns: 0 Success
187 * ENOBUFS
188 * ENOMEM
189 * ipsec_init_policy:??? [IPSEC]
190 */
191 int
192 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo, __unused struct proc *p)
193 {
194 struct inpcb *inp;
195 caddr_t temp;
196 #if IPSEC
197 #ifndef __APPLE__
198 int error;
199 #endif
200 #endif
201 #if CONFIG_MACF_NET
202 int mac_error;
203 #endif
204
205 if (so->cached_in_sock_layer == 0) {
206 #if TEMPDEBUG
207 printf("PCBALLOC calling zalloc for socket %x\n", so);
208 #endif
209 inp = (struct inpcb *) zalloc(pcbinfo->ipi_zone);
210 if (inp == NULL)
211 return (ENOBUFS);
212 bzero((caddr_t)inp, sizeof(*inp));
213 }
214 else {
215 #if TEMPDEBUG
216 printf("PCBALLOC reusing PCB for socket %x\n", so);
217 #endif
218 inp = (struct inpcb *) so->so_saved_pcb;
219 temp = inp->inp_saved_ppcb;
220 bzero((caddr_t) inp, sizeof(*inp));
221 inp->inp_saved_ppcb = temp;
222 }
223
224 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
225 inp->inp_pcbinfo = pcbinfo;
226 inp->inp_socket = so;
227 #if CONFIG_MACF_NET
228 mac_error = mac_inpcb_label_init(inp, M_WAITOK);
229 if (mac_error != 0) {
230 if (so->cached_in_sock_layer == 0)
231 zfree(pcbinfo->ipi_zone, inp);
232 return (mac_error);
233 }
234 mac_inpcb_label_associate(so, inp);
235 #endif
236 so->so_pcb = (caddr_t)inp;
237
238 if (so->so_proto->pr_flags & PR_PCBLOCK) {
239 inp->inpcb_mtx = lck_mtx_alloc_init(pcbinfo->mtx_grp, pcbinfo->mtx_attr);
240 if (inp->inpcb_mtx == NULL) {
241 printf("in_pcballoc: can't alloc mutex! so=%p\n", so);
242 return(ENOMEM);
243 }
244 }
245
246 #if IPSEC
247 #ifndef __APPLE__
248 if (ipsec_bypass == 0) {
249 error = ipsec_init_policy(so, &inp->inp_sp);
250 if (error != 0) {
251 zfree(pcbinfo->ipi_zone, inp);
252 return error;
253 }
254 }
255 #endif
256 #endif /*IPSEC*/
257 #if INET6
258 if (INP_SOCKAF(so) == AF_INET6 && !ip6_mapped_addr_on)
259 inp->inp_flags |= IN6P_IPV6_V6ONLY;
260 #endif
261
262 #if INET6
263 if (ip6_auto_flowlabel)
264 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
265 #endif
266 lck_rw_lock_exclusive(pcbinfo->mtx);
267 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
268 LIST_INSERT_HEAD(pcbinfo->listhead, inp, inp_list);
269 pcbinfo->ipi_count++;
270 lck_rw_done(pcbinfo->mtx);
271 return (0);
272 }
273
274
275 /*
276 in_pcblookup_local_and_cleanup does everything
277 in_pcblookup_local does but it checks for a socket
278 that's going away. Since we know that the lock is
279 held read+write when this funciton is called, we
280 can safely dispose of this socket like the slow
281 timer would usually do and return NULL. This is
282 great for bind.
283 */
284 struct inpcb*
285 in_pcblookup_local_and_cleanup(
286 struct inpcbinfo *pcbinfo,
287 struct in_addr laddr,
288 u_int lport_arg,
289 int wild_okay)
290 {
291 struct inpcb *inp;
292
293 /* Perform normal lookup */
294 inp = in_pcblookup_local(pcbinfo, laddr, lport_arg, wild_okay);
295
296 /* Check if we found a match but it's waiting to be disposed */
297 if (inp && inp->inp_wantcnt == WNT_STOPUSING) {
298 struct socket *so = inp->inp_socket;
299
300 lck_mtx_lock(inp->inpcb_mtx);
301
302 if (so->so_usecount == 0) {
303 if (inp->inp_state != INPCB_STATE_DEAD)
304 in_pcbdetach(inp);
305 in_pcbdispose(inp);
306 inp = NULL;
307 }
308 else {
309 lck_mtx_unlock(inp->inpcb_mtx);
310 }
311 }
312
313 return inp;
314 }
315
316 #ifdef __APPLE_API_PRIVATE
317 static void
318 in_pcb_conflict_post_msg(u_int16_t port)
319 {
320 /*
321 * Radar 5523020 send a kernel event notification if a non-participating socket tries to bind
322 * the port a socket who has set SOF_NOTIFYCONFLICT owns.
323 */
324 struct kev_msg ev_msg;
325 struct kev_in_portinuse in_portinuse;
326
327 in_portinuse.port = ntohs(port); /* port in host order */
328 in_portinuse.req_pid = proc_selfpid();
329 ev_msg.vendor_code = KEV_VENDOR_APPLE;
330 ev_msg.kev_class = KEV_NETWORK_CLASS;
331 ev_msg.kev_subclass = KEV_INET_SUBCLASS;
332 ev_msg.event_code = KEV_INET_PORTINUSE;
333 ev_msg.dv[0].data_ptr = &in_portinuse;
334 ev_msg.dv[0].data_length = sizeof(struct kev_in_portinuse);
335 ev_msg.dv[1].data_length = 0;
336 kev_post_msg(&ev_msg);
337 }
338 #endif
339 /*
340 * Returns: 0 Success
341 * EADDRNOTAVAIL Address not available.
342 * EINVAL Invalid argument
343 * EAFNOSUPPORT Address family not supported [notdef]
344 * EACCES Permission denied
345 * EADDRINUSE Address in use
346 * EAGAIN Resource unavailable, try again
347 * proc_suser:EPERM Operation not permitted
348 */
349 int
350 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
351 {
352 struct socket *so = inp->inp_socket;
353 unsigned short *lastport;
354 struct sockaddr_in *sin;
355 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
356 u_short lport = 0, rand_port = 0;
357 int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
358 int error, randomport, conflict = 0;
359
360 if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
361 return (EADDRNOTAVAIL);
362 if (inp->inp_lport || inp->inp_laddr.s_addr != INADDR_ANY)
363 return (EINVAL);
364 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
365 wild = 1;
366 socket_unlock(so, 0); /* keep reference on socket */
367 lck_rw_lock_exclusive(pcbinfo->mtx);
368 if (nam) {
369 sin = (struct sockaddr_in *)nam;
370 if (nam->sa_len != sizeof (*sin)) {
371 lck_rw_done(pcbinfo->mtx);
372 socket_lock(so, 0);
373 return (EINVAL);
374 }
375 #ifdef notdef
376 /*
377 * We should check the family, but old programs
378 * incorrectly fail to initialize it.
379 */
380 if (sin->sin_family != AF_INET) {
381 lck_rw_done(pcbinfo->mtx);
382 socket_lock(so, 0);
383 return (EAFNOSUPPORT);
384 }
385 #endif
386 lport = sin->sin_port;
387 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
388 /*
389 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
390 * allow complete duplication of binding if
391 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
392 * and a multicast address is bound on both
393 * new and duplicated sockets.
394 */
395 if (so->so_options & SO_REUSEADDR)
396 reuseport = SO_REUSEADDR|SO_REUSEPORT;
397 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
398 struct ifaddr *ifa;
399 sin->sin_port = 0; /* yech... */
400 if ((ifa = ifa_ifwithaddr((struct sockaddr *)sin)) == 0) {
401 lck_rw_done(pcbinfo->mtx);
402 socket_lock(so, 0);
403 return (EADDRNOTAVAIL);
404 }
405 else {
406 ifafree(ifa);
407 }
408 }
409 if (lport) {
410 struct inpcb *t;
411
412 /* GROSS */
413 #if !CONFIG_EMBEDDED
414 if (ntohs(lport) < IPPORT_RESERVED && proc_suser(p)) {
415 lck_rw_done(pcbinfo->mtx);
416 socket_lock(so, 0);
417 return (EACCES);
418 }
419 #endif
420 if (so->so_uid &&
421 !IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
422 t = in_pcblookup_local_and_cleanup(inp->inp_pcbinfo,
423 sin->sin_addr, lport, INPLOOKUP_WILDCARD);
424 if (t &&
425 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
426 ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
427 (t->inp_socket->so_options &
428 SO_REUSEPORT) == 0) &&
429 (so->so_uid != t->inp_socket->so_uid) &&
430 ((t->inp_socket->so_flags & SOF_REUSESHAREUID) == 0)) {
431 #if INET6
432 if (ntohl(sin->sin_addr.s_addr) !=
433 INADDR_ANY ||
434 ntohl(t->inp_laddr.s_addr) !=
435 INADDR_ANY ||
436 INP_SOCKAF(so) ==
437 INP_SOCKAF(t->inp_socket))
438 #endif /* INET6 */
439 {
440 #ifdef __APPLE_API_PRIVATE
441
442 if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
443 conflict = 1;
444
445 lck_rw_done(pcbinfo->mtx);
446
447 if (conflict)
448 in_pcb_conflict_post_msg(lport);
449 #else
450 lck_rw_done(pcbinfo->mtx);
451 #endif /* __APPLE_API_PRIVATE */
452
453 socket_lock(so, 0);
454 return (EADDRINUSE);
455 }
456 }
457 }
458 t = in_pcblookup_local_and_cleanup(pcbinfo, sin->sin_addr,
459 lport, wild);
460 if (t &&
461 (reuseport & t->inp_socket->so_options) == 0) {
462 #if INET6
463 if (ip6_mapped_addr_on == 0 ||
464 ntohl(sin->sin_addr.s_addr) !=
465 INADDR_ANY ||
466 ntohl(t->inp_laddr.s_addr) !=
467 INADDR_ANY ||
468 INP_SOCKAF(so) ==
469 INP_SOCKAF(t->inp_socket))
470 #endif /* INET6 */
471 {
472 #ifdef __APPLE_API_PRIVATE
473
474 if ((t->inp_socket->so_flags & SOF_NOTIFYCONFLICT) && ((so->so_flags & SOF_NOTIFYCONFLICT) == 0))
475 conflict = 1;
476
477 lck_rw_done(pcbinfo->mtx);
478
479 if (conflict)
480 in_pcb_conflict_post_msg(lport);
481 #else
482 lck_rw_done(pcbinfo->mtx);
483 #endif /* __APPLE_API_PRIVATE */
484 socket_lock(so, 0);
485 return (EADDRINUSE);
486 }
487 }
488 }
489 inp->inp_laddr = sin->sin_addr;
490 }
491 if (lport == 0) {
492 u_short first, last;
493 int count;
494
495 randomport = (so->so_flags & SOF_BINDRANDOMPORT) ||
496 (so->so_type == SOCK_STREAM ? tcp_use_randomport : udp_use_randomport);
497
498 inp->inp_flags |= INP_ANONPORT;
499
500 if (inp->inp_flags & INP_HIGHPORT) {
501 first = ipport_hifirstauto; /* sysctl */
502 last = ipport_hilastauto;
503 lastport = &pcbinfo->lasthi;
504 } else if (inp->inp_flags & INP_LOWPORT) {
505 if ((error = proc_suser(p)) != 0) {
506 lck_rw_done(pcbinfo->mtx);
507 socket_lock(so, 0);
508 return error;
509 }
510 first = ipport_lowfirstauto; /* 1023 */
511 last = ipport_lowlastauto; /* 600 */
512 lastport = &pcbinfo->lastlow;
513 } else {
514 first = ipport_firstauto; /* sysctl */
515 last = ipport_lastauto;
516 lastport = &pcbinfo->lastport;
517 }
518 /* No point in randomizing if only one port is available */
519
520 if (first == last)
521 randomport = 0;
522 /*
523 * Simple check to ensure all ports are not used up causing
524 * a deadlock here.
525 *
526 * We split the two cases (up and down) so that the direction
527 * is not being tested on each round of the loop.
528 */
529 if (first > last) {
530 /*
531 * counting down
532 */
533 if (randomport) {
534 read_random(&rand_port, sizeof(rand_port));
535 *lastport = first - (rand_port % (first - last));
536 }
537 count = first - last;
538
539 do {
540 if (count-- < 0) { /* completely used? */
541 lck_rw_done(pcbinfo->mtx);
542 socket_lock(so, 0);
543 inp->inp_laddr.s_addr = INADDR_ANY;
544 return (EADDRNOTAVAIL);
545 }
546 --*lastport;
547 if (*lastport > first || *lastport < last)
548 *lastport = first;
549 lport = htons(*lastport);
550 } while (in_pcblookup_local_and_cleanup(pcbinfo,
551 inp->inp_laddr, lport, wild));
552 } else {
553 /*
554 * counting up
555 */
556 if (randomport) {
557 read_random(&rand_port, sizeof(rand_port));
558 *lastport = first + (rand_port % (first - last));
559 }
560 count = last - first;
561
562 do {
563 if (count-- < 0) { /* completely used? */
564 lck_rw_done(pcbinfo->mtx);
565 socket_lock(so, 0);
566 inp->inp_laddr.s_addr = INADDR_ANY;
567 return (EADDRNOTAVAIL);
568 }
569 ++*lastport;
570 if (*lastport < first || *lastport > last)
571 *lastport = first;
572 lport = htons(*lastport);
573 } while (in_pcblookup_local_and_cleanup(pcbinfo,
574 inp->inp_laddr, lport, wild));
575 }
576 }
577 socket_lock(so, 0);
578 inp->inp_lport = lport;
579 if (in_pcbinshash(inp, 1) != 0) {
580 inp->inp_laddr.s_addr = INADDR_ANY;
581 inp->inp_lport = 0;
582 lck_rw_done(pcbinfo->mtx);
583 return (EAGAIN);
584 }
585 lck_rw_done(pcbinfo->mtx);
586 sflt_notify(so, sock_evt_bound, NULL);
587 return (0);
588 }
589
590 /*
591 * Transform old in_pcbconnect() into an inner subroutine for new
592 * in_pcbconnect(): Do some validity-checking on the remote
593 * address (in mbuf 'nam') and then determine local host address
594 * (i.e., which interface) to use to access that remote host.
595 *
596 * This preserves definition of in_pcbconnect(), while supporting a
597 * slightly different version for T/TCP. (This is more than
598 * a bit of a kludge, but cleaning up the internal interfaces would
599 * have forced minor changes in every protocol).
600 *
601 * Returns: 0 Success
602 * EINVAL Invalid argument
603 * EAFNOSUPPORT Address family not supported
604 * EADDRNOTAVAIL Address not available
605 */
606 int
607 in_pcbladdr(struct inpcb *inp, struct sockaddr *nam,
608 struct sockaddr_in **plocal_sin)
609 {
610 struct in_ifaddr *ia;
611 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
612
613 if (nam->sa_len != sizeof (*sin))
614 return (EINVAL);
615 if (sin->sin_family != AF_INET)
616 return (EAFNOSUPPORT);
617 if (sin->sin_port == 0)
618 return (EADDRNOTAVAIL);
619
620 lck_rw_lock_shared(in_ifaddr_rwlock);
621 if (!TAILQ_EMPTY(&in_ifaddrhead)) {
622 /*
623 * If the destination address is INADDR_ANY,
624 * use the primary local address.
625 * If the supplied address is INADDR_BROADCAST,
626 * and the primary interface supports broadcast,
627 * choose the broadcast address for that interface.
628 */
629 #define satosin(sa) ((struct sockaddr_in *)(sa))
630 #define sintosa(sin) ((struct sockaddr *)(sin))
631 #define ifatoia(ifa) ((struct in_ifaddr *)(ifa))
632 if (sin->sin_addr.s_addr == INADDR_ANY)
633 sin->sin_addr = IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr;
634 else if (sin->sin_addr.s_addr == (u_int32_t)INADDR_BROADCAST &&
635 (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST))
636 sin->sin_addr = satosin(&TAILQ_FIRST(&in_ifaddrhead)->ia_broadaddr)->sin_addr;
637 }
638 lck_rw_done(in_ifaddr_rwlock);
639
640 if (inp->inp_laddr.s_addr == INADDR_ANY) {
641 struct route *ro;
642 unsigned int ifscope;
643
644 ia = (struct in_ifaddr *)0;
645 ifscope = (inp->inp_flags & INP_BOUND_IF) ?
646 inp->inp_boundif : IFSCOPE_NONE;
647 /*
648 * If route is known or can be allocated now,
649 * our src addr is taken from the i/f, else punt.
650 * Note that we should check the address family of the cached
651 * destination, in case of sharing the cache with IPv6.
652 */
653 ro = &inp->inp_route;
654 if (ro->ro_rt != NULL)
655 RT_LOCK_SPIN(ro->ro_rt);
656 if (ro->ro_rt && (ro->ro_dst.sa_family != AF_INET ||
657 satosin(&ro->ro_dst)->sin_addr.s_addr !=
658 sin->sin_addr.s_addr ||
659 inp->inp_socket->so_options & SO_DONTROUTE ||
660 ro->ro_rt->generation_id != route_generation)) {
661 RT_UNLOCK(ro->ro_rt);
662 rtfree(ro->ro_rt);
663 ro->ro_rt = NULL;
664 }
665 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
666 (ro->ro_rt == NULL || ro->ro_rt->rt_ifp == NULL)) {
667 if (ro->ro_rt != NULL)
668 RT_UNLOCK(ro->ro_rt);
669 /* No route yet, so try to acquire one */
670 bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
671 ro->ro_dst.sa_family = AF_INET;
672 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
673 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
674 sin->sin_addr;
675 rtalloc_scoped_ign(ro, 0, ifscope);
676 if (ro->ro_rt != NULL)
677 RT_LOCK_SPIN(ro->ro_rt);
678 }
679 /*
680 * If we found a route, use the address
681 * corresponding to the outgoing interface
682 * unless it is the loopback (in case a route
683 * to our address on another net goes to loopback).
684 */
685 if (ro->ro_rt != NULL) {
686 RT_LOCK_ASSERT_HELD(ro->ro_rt);
687 if (!(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) {
688 ia = ifatoia(ro->ro_rt->rt_ifa);
689 if (ia)
690 ifaref(&ia->ia_ifa);
691 }
692 RT_UNLOCK(ro->ro_rt);
693 }
694 if (ia == 0) {
695 u_short fport = sin->sin_port;
696
697 sin->sin_port = 0;
698 ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin)));
699 if (ia == 0) {
700 ia = ifatoia(ifa_ifwithnet_scoped(sintosa(sin),
701 ifscope));
702 }
703 sin->sin_port = fport;
704 if (ia == 0) {
705 lck_rw_lock_shared(in_ifaddr_rwlock);
706 ia = TAILQ_FIRST(&in_ifaddrhead);
707 if (ia)
708 ifaref(&ia->ia_ifa);
709 lck_rw_done(in_ifaddr_rwlock);
710 }
711 if (ia == 0)
712 return (EADDRNOTAVAIL);
713 }
714 /*
715 * If the destination address is multicast and an outgoing
716 * interface has been set as a multicast option, use the
717 * address of that interface as our source address.
718 */
719 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
720 inp->inp_moptions != NULL) {
721 struct ip_moptions *imo;
722 struct ifnet *ifp;
723
724 imo = inp->inp_moptions;
725 if (imo->imo_multicast_ifp != NULL && (ia == NULL ||
726 ia->ia_ifp != imo->imo_multicast_ifp)) {
727 ifp = imo->imo_multicast_ifp;
728 if (ia)
729 ifafree(&ia->ia_ifa);
730 lck_rw_lock_shared(in_ifaddr_rwlock);
731 TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) {
732 if (ia->ia_ifp == ifp)
733 break;
734 }
735 if (ia)
736 ifaref(&ia->ia_ifa);
737 lck_rw_done(in_ifaddr_rwlock);
738 if (ia == 0)
739 return (EADDRNOTAVAIL);
740 }
741 }
742 /*
743 * Don't do pcblookup call here; return interface in plocal_sin
744 * and exit to caller, that will do the lookup.
745 */
746 *plocal_sin = &ia->ia_addr;
747 ifafree(&ia->ia_ifa);
748 }
749 return(0);
750 }
751
752 /*
753 * Outer subroutine:
754 * Connect from a socket to a specified address.
755 * Both address and port must be specified in argument sin.
756 * If don't have a local address for this socket yet,
757 * then pick one.
758 */
759 int
760 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
761 {
762 struct sockaddr_in *ifaddr;
763 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
764 struct inpcb *pcb;
765 int error;
766
767 /*
768 * Call inner routine, to assign local interface address.
769 */
770 if ((error = in_pcbladdr(inp, nam, &ifaddr)) != 0)
771 return(error);
772
773 socket_unlock(inp->inp_socket, 0);
774 pcb = in_pcblookup_hash(inp->inp_pcbinfo, sin->sin_addr, sin->sin_port,
775 inp->inp_laddr.s_addr ? inp->inp_laddr : ifaddr->sin_addr,
776 inp->inp_lport, 0, NULL);
777 socket_lock(inp->inp_socket, 0);
778 if (pcb != NULL) {
779 in_pcb_checkstate(pcb, WNT_RELEASE, 0);
780 return (EADDRINUSE);
781 }
782 if (inp->inp_laddr.s_addr == INADDR_ANY) {
783 if (inp->inp_lport == 0) {
784 error = in_pcbbind(inp, (struct sockaddr *)0, p);
785 if (error)
786 return (error);
787 }
788 if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
789 /*lock inversion issue, mostly with udp multicast packets */
790 socket_unlock(inp->inp_socket, 0);
791 lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
792 socket_lock(inp->inp_socket, 0);
793 }
794 inp->inp_laddr = ifaddr->sin_addr;
795 inp->inp_flags |= INP_INADDR_ANY;
796 }
797 else {
798 if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
799 /*lock inversion issue, mostly with udp multicast packets */
800 socket_unlock(inp->inp_socket, 0);
801 lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
802 socket_lock(inp->inp_socket, 0);
803 }
804 }
805 inp->inp_faddr = sin->sin_addr;
806 inp->inp_fport = sin->sin_port;
807 in_pcbrehash(inp);
808 lck_rw_done(inp->inp_pcbinfo->mtx);
809 return (0);
810 }
811
812 void
813 in_pcbdisconnect(struct inpcb *inp)
814 {
815
816 inp->inp_faddr.s_addr = INADDR_ANY;
817 inp->inp_fport = 0;
818
819 if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->mtx)) {
820 /*lock inversion issue, mostly with udp multicast packets */
821 socket_unlock(inp->inp_socket, 0);
822 lck_rw_lock_exclusive(inp->inp_pcbinfo->mtx);
823 socket_lock(inp->inp_socket, 0);
824 }
825
826 in_pcbrehash(inp);
827 lck_rw_done(inp->inp_pcbinfo->mtx);
828
829 if (inp->inp_socket->so_state & SS_NOFDREF)
830 in_pcbdetach(inp);
831 }
832
833 void
834 in_pcbdetach(struct inpcb *inp)
835 {
836 struct socket *so = inp->inp_socket;
837
838 if (so->so_pcb == 0) { /* we've been called twice */
839 panic("in_pcbdetach: inp=%p so=%p proto=%d so_pcb is null!\n",
840 inp, so, so->so_proto->pr_protocol);
841 }
842
843 #if IPSEC
844 if (ipsec_bypass == 0) {
845 ipsec4_delete_pcbpolicy(inp);
846 }
847 #endif /*IPSEC*/
848
849 /* mark socket state as dead */
850 if (in_pcb_checkstate(inp, WNT_STOPUSING, 1) != WNT_STOPUSING)
851 panic("in_pcbdetach so=%p prot=%x couldn't set to STOPUSING\n", so, so->so_proto->pr_protocol);
852
853 #if TEMPDEBUG
854 if (so->cached_in_sock_layer)
855 printf("in_pcbdetach for cached socket %x flags=%x\n", so, so->so_flags);
856 else
857 printf("in_pcbdetach for allocated socket %x flags=%x\n", so, so->so_flags);
858 #endif
859 if ((so->so_flags & SOF_PCBCLEARING) == 0) {
860 struct rtentry *rt;
861
862 inp->inp_vflag = 0;
863 if (inp->inp_options)
864 (void)m_free(inp->inp_options);
865 if ((rt = inp->inp_route.ro_rt) != NULL) {
866 inp->inp_route.ro_rt = NULL;
867 rtfree(rt);
868 }
869 ip_freemoptions(inp->inp_moptions);
870 inp->inp_moptions = NULL;
871 sofreelastref(so, 0);
872 inp->inp_state = INPCB_STATE_DEAD;
873 so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
874 }
875 }
876
877
878 void
879 in_pcbdispose(struct inpcb *inp)
880 {
881 struct socket *so = inp->inp_socket;
882 struct inpcbinfo *ipi = inp->inp_pcbinfo;
883
884 #if TEMPDEBUG
885 if (inp->inp_state != INPCB_STATE_DEAD) {
886 printf("in_pcbdispose: not dead yet? so=%p\n", so);
887 }
888 #endif
889
890 if (so && so->so_usecount != 0)
891 panic("in_pcbdispose: use count=%x so=%p\n", so->so_usecount, so);
892
893 lck_rw_assert(ipi->mtx, LCK_RW_ASSERT_EXCLUSIVE);
894
895 inp->inp_gencnt = ++ipi->ipi_gencnt;
896 /*### access ipi in in_pcbremlists */
897 in_pcbremlists(inp);
898
899 if (so) {
900 if (so->so_proto->pr_flags & PR_PCBLOCK) {
901 sofreelastref(so, 0);
902 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
903 #if TEMPDEBUG
904 printf("in_pcbdispose sb not cleaned up so=%p rc_cci=%x snd_cc=%x\n",
905 so, so->so_rcv.sb_cc, so->so_snd.sb_cc);
906 #endif
907 sbrelease(&so->so_rcv);
908 sbrelease(&so->so_snd);
909 }
910 if (so->so_head != NULL)
911 panic("in_pcbdispose, so=%p head still exist\n", so);
912 lck_mtx_unlock(inp->inpcb_mtx);
913 lck_mtx_free(inp->inpcb_mtx, ipi->mtx_grp);
914 }
915 so->so_flags |= SOF_PCBCLEARING; /* makes sure we're not called twice from so_close */
916 so->so_saved_pcb = (caddr_t) inp;
917 so->so_pcb = 0;
918 inp->inp_socket = 0;
919 #if CONFIG_MACF_NET
920 mac_inpcb_label_destroy(inp);
921 #endif
922 /*
923 * In case there a route cached after a detach (possible
924 * in the tcp case), make sure that it is freed before
925 * we deallocate the structure.
926 */
927 if (inp->inp_route.ro_rt != NULL) {
928 rtfree(inp->inp_route.ro_rt);
929 inp->inp_route.ro_rt = NULL;
930 }
931 if (so->cached_in_sock_layer == 0) {
932 zfree(ipi->ipi_zone, inp);
933 }
934 sodealloc(so);
935 }
936 #if TEMPDEBUG
937 else
938 printf("in_pcbdispose: no socket for inp=%p\n", inp);
939 #endif
940 }
941
942 /*
943 * The calling convention of in_setsockaddr() and in_setpeeraddr() was
944 * modified to match the pru_sockaddr() and pru_peeraddr() entry points
945 * in struct pr_usrreqs, so that protocols can just reference then directly
946 * without the need for a wrapper function. The socket must have a valid
947 * (i.e., non-nil) PCB, but it should be impossible to get an invalid one
948 * except through a kernel programming error, so it is acceptable to panic
949 * (or in this case trap) if the PCB is invalid. (Actually, we don't trap
950 * because there actually /is/ a programming error somewhere... XXX)
951 *
952 * Returns: 0 Success
953 * ENOBUFS No buffer space available
954 * ECONNRESET Connection reset
955 */
956 int
957 in_setsockaddr(struct socket *so, struct sockaddr **nam)
958 {
959 struct inpcb *inp;
960 struct sockaddr_in *sin;
961
962 /*
963 * Do the malloc first in case it blocks.
964 */
965 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
966 if (sin == NULL)
967 return ENOBUFS;
968 bzero(sin, sizeof *sin);
969 sin->sin_family = AF_INET;
970 sin->sin_len = sizeof(*sin);
971
972 inp = sotoinpcb(so);
973 if (!inp) {
974 FREE(sin, M_SONAME);
975 return ECONNRESET;
976 }
977 sin->sin_port = inp->inp_lport;
978 sin->sin_addr = inp->inp_laddr;
979
980 *nam = (struct sockaddr *)sin;
981 return 0;
982 }
983
984 int
985 in_setpeeraddr(struct socket *so, struct sockaddr **nam)
986 {
987 struct inpcb *inp;
988 struct sockaddr_in *sin;
989
990 /*
991 * Do the malloc first in case it blocks.
992 */
993 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_WAITOK);
994 if (sin == NULL)
995 return ENOBUFS;
996 bzero((caddr_t)sin, sizeof (*sin));
997 sin->sin_family = AF_INET;
998 sin->sin_len = sizeof(*sin);
999
1000 inp = sotoinpcb(so);
1001 if (!inp) {
1002 FREE(sin, M_SONAME);
1003 return ECONNRESET;
1004 }
1005 sin->sin_port = inp->inp_fport;
1006 sin->sin_addr = inp->inp_faddr;
1007
1008 *nam = (struct sockaddr *)sin;
1009 return 0;
1010 }
1011
1012 void
1013 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1014 int errno, void (*notify)(struct inpcb *, int))
1015 {
1016 struct inpcb *inp;
1017
1018 lck_rw_lock_shared(pcbinfo->mtx);
1019
1020 LIST_FOREACH(inp, pcbinfo->listhead, inp_list) {
1021 #if INET6
1022 if ((inp->inp_vflag & INP_IPV4) == 0)
1023 continue;
1024 #endif
1025 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1026 inp->inp_socket == NULL)
1027 continue;
1028 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
1029 continue;
1030 socket_lock(inp->inp_socket, 1);
1031 (*notify)(inp, errno);
1032 (void)in_pcb_checkstate(inp, WNT_RELEASE, 1);
1033 socket_unlock(inp->inp_socket, 1);
1034 }
1035 lck_rw_done(pcbinfo->mtx);
1036 }
1037
1038 /*
1039 * Check for alternatives when higher level complains
1040 * about service problems. For now, invalidate cached
1041 * routing information. If the route was created dynamically
1042 * (by a redirect), time to try a default gateway again.
1043 */
1044 void
1045 in_losing(struct inpcb *inp)
1046 {
1047 struct rtentry *rt;
1048 struct rt_addrinfo info;
1049
1050 if ((rt = inp->inp_route.ro_rt) != NULL) {
1051 struct in_ifaddr *ia;
1052
1053 bzero((caddr_t)&info, sizeof(info));
1054 RT_LOCK(rt);
1055 info.rti_info[RTAX_DST] =
1056 (struct sockaddr *)&inp->inp_route.ro_dst;
1057 info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1058 info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1059 rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
1060 if (rt->rt_flags & RTF_DYNAMIC) {
1061 /*
1062 * Prevent another thread from modifying rt_key,
1063 * rt_gateway via rt_setgate() after rt_lock is
1064 * dropped by marking the route as defunct.
1065 */
1066 rt->rt_flags |= RTF_CONDEMNED;
1067 RT_UNLOCK(rt);
1068 (void) rtrequest(RTM_DELETE, rt_key(rt),
1069 rt->rt_gateway, rt_mask(rt), rt->rt_flags,
1070 (struct rtentry **)0);
1071 } else {
1072 RT_UNLOCK(rt);
1073 }
1074 /* if the address is gone keep the old route in the pcb */
1075 if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) != NULL) {
1076 inp->inp_route.ro_rt = NULL;
1077 rtfree(rt);
1078 ifafree(&ia->ia_ifa);
1079 }
1080 /*
1081 * A new route can be allocated
1082 * the next time output is attempted.
1083 */
1084 }
1085 }
1086
1087 /*
1088 * After a routing change, flush old routing
1089 * and allocate a (hopefully) better one.
1090 */
1091 void
1092 in_rtchange(struct inpcb *inp, __unused int errno)
1093 {
1094 struct rtentry *rt;
1095
1096 if ((rt = inp->inp_route.ro_rt) != NULL) {
1097 struct in_ifaddr *ia;
1098
1099 if ((ia = ifa_foraddr(inp->inp_laddr.s_addr)) == NULL) {
1100 return; /* we can't remove the route now. not sure if still ok to use src */
1101 }
1102 ifafree(&ia->ia_ifa);
1103 rtfree(rt);
1104 inp->inp_route.ro_rt = NULL;
1105 /*
1106 * A new route can be allocated the next time
1107 * output is attempted.
1108 */
1109 }
1110 }
1111
1112 /*
1113 * Lookup a PCB based on the local address and port.
1114 */
1115 struct inpcb *
1116 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1117 unsigned int lport_arg, int wild_okay)
1118 {
1119 struct inpcb *inp;
1120 int matchwild = 3, wildcard;
1121 u_short lport = lport_arg;
1122
1123 KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_START, 0,0,0,0,0);
1124
1125 if (!wild_okay) {
1126 struct inpcbhead *head;
1127 /*
1128 * Look for an unconnected (wildcard foreign addr) PCB that
1129 * matches the local address and port we're looking for.
1130 */
1131 head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1132 LIST_FOREACH(inp, head, inp_hash) {
1133 #if INET6
1134 if ((inp->inp_vflag & INP_IPV4) == 0)
1135 continue;
1136 #endif
1137 if (inp->inp_faddr.s_addr == INADDR_ANY &&
1138 inp->inp_laddr.s_addr == laddr.s_addr &&
1139 inp->inp_lport == lport) {
1140 /*
1141 * Found.
1142 */
1143 return (inp);
1144 }
1145 }
1146 /*
1147 * Not found.
1148 */
1149 KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, 0,0,0,0,0);
1150 return (NULL);
1151 } else {
1152 struct inpcbporthead *porthash;
1153 struct inpcbport *phd;
1154 struct inpcb *match = NULL;
1155 /*
1156 * Best fit PCB lookup.
1157 *
1158 * First see if this local port is in use by looking on the
1159 * port hash list.
1160 */
1161 porthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(lport,
1162 pcbinfo->porthashmask)];
1163 LIST_FOREACH(phd, porthash, phd_hash) {
1164 if (phd->phd_port == lport)
1165 break;
1166 }
1167 if (phd != NULL) {
1168 /*
1169 * Port is in use by one or more PCBs. Look for best
1170 * fit.
1171 */
1172 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1173 wildcard = 0;
1174 #if INET6
1175 if ((inp->inp_vflag & INP_IPV4) == 0)
1176 continue;
1177 #endif
1178 if (inp->inp_faddr.s_addr != INADDR_ANY)
1179 wildcard++;
1180 if (inp->inp_laddr.s_addr != INADDR_ANY) {
1181 if (laddr.s_addr == INADDR_ANY)
1182 wildcard++;
1183 else if (inp->inp_laddr.s_addr != laddr.s_addr)
1184 continue;
1185 } else {
1186 if (laddr.s_addr != INADDR_ANY)
1187 wildcard++;
1188 }
1189 if (wildcard < matchwild) {
1190 match = inp;
1191 matchwild = wildcard;
1192 if (matchwild == 0) {
1193 break;
1194 }
1195 }
1196 }
1197 }
1198 KERNEL_DEBUG(DBG_FNC_PCB_LOOKUP | DBG_FUNC_END, match,0,0,0,0);
1199 return (match);
1200 }
1201 }
1202
1203 /*
1204 * Lookup PCB in hash list.
1205 */
1206 struct inpcb *
1207 in_pcblookup_hash(
1208 struct inpcbinfo *pcbinfo,
1209 struct in_addr faddr,
1210 u_int fport_arg,
1211 struct in_addr laddr,
1212 u_int lport_arg,
1213 int wildcard,
1214 __unused struct ifnet *ifp)
1215 {
1216 struct inpcbhead *head;
1217 struct inpcb *inp;
1218 u_short fport = fport_arg, lport = lport_arg;
1219
1220 /*
1221 * We may have found the pcb in the last lookup - check this first.
1222 */
1223
1224 lck_rw_lock_shared(pcbinfo->mtx);
1225
1226 /*
1227 * First look for an exact match.
1228 */
1229 head = &pcbinfo->hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, pcbinfo->hashmask)];
1230 LIST_FOREACH(inp, head, inp_hash) {
1231 #if INET6
1232 if ((inp->inp_vflag & INP_IPV4) == 0)
1233 continue;
1234 #endif
1235 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1236 inp->inp_laddr.s_addr == laddr.s_addr &&
1237 inp->inp_fport == fport &&
1238 inp->inp_lport == lport) {
1239 /*
1240 * Found.
1241 */
1242 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1243 lck_rw_done(pcbinfo->mtx);
1244 return (inp);
1245 }
1246 else { /* it's there but dead, say it isn't found */
1247 lck_rw_done(pcbinfo->mtx);
1248 return(NULL);
1249 }
1250 }
1251 }
1252 if (wildcard) {
1253 struct inpcb *local_wild = NULL;
1254 #if INET6
1255 struct inpcb *local_wild_mapped = NULL;
1256 #endif
1257
1258 head = &pcbinfo->hashbase[INP_PCBHASH(INADDR_ANY, lport, 0, pcbinfo->hashmask)];
1259 LIST_FOREACH(inp, head, inp_hash) {
1260 #if INET6
1261 if ((inp->inp_vflag & INP_IPV4) == 0)
1262 continue;
1263 #endif
1264 if (inp->inp_faddr.s_addr == INADDR_ANY &&
1265 inp->inp_lport == lport) {
1266 #if defined(NFAITH) && NFAITH > 0
1267 if (ifp && ifp->if_type == IFT_FAITH &&
1268 (inp->inp_flags & INP_FAITH) == 0)
1269 continue;
1270 #endif
1271 if (inp->inp_laddr.s_addr == laddr.s_addr) {
1272 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1273 lck_rw_done(pcbinfo->mtx);
1274 return (inp);
1275 }
1276 else { /* it's there but dead, say it isn't found */
1277 lck_rw_done(pcbinfo->mtx);
1278 return(NULL);
1279 }
1280 }
1281 else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1282 #if INET6
1283 if (INP_CHECK_SOCKAF(inp->inp_socket,
1284 AF_INET6))
1285 local_wild_mapped = inp;
1286 else
1287 #endif /* INET6 */
1288 local_wild = inp;
1289 }
1290 }
1291 }
1292 if (local_wild == NULL) {
1293 #if INET6
1294 if (local_wild_mapped != NULL) {
1295 if (in_pcb_checkstate(local_wild_mapped, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1296 lck_rw_done(pcbinfo->mtx);
1297 return (local_wild_mapped);
1298 }
1299 else { /* it's there but dead, say it isn't found */
1300 lck_rw_done(pcbinfo->mtx);
1301 return(NULL);
1302 }
1303 }
1304 #endif /* INET6 */
1305 lck_rw_done(pcbinfo->mtx);
1306 return (NULL);
1307 }
1308 if (in_pcb_checkstate(local_wild, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
1309 lck_rw_done(pcbinfo->mtx);
1310 return (local_wild);
1311 }
1312 else { /* it's there but dead, say it isn't found */
1313 lck_rw_done(pcbinfo->mtx);
1314 return(NULL);
1315 }
1316 }
1317
1318 /*
1319 * Not found.
1320 */
1321 lck_rw_done(pcbinfo->mtx);
1322 return (NULL);
1323 }
1324
1325 /*
1326 * Insert PCB onto various hash lists.
1327 */
1328 int
1329 in_pcbinshash(struct inpcb *inp, int locked)
1330 {
1331 struct inpcbhead *pcbhash;
1332 struct inpcbporthead *pcbporthash;
1333 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1334 struct inpcbport *phd;
1335 u_int32_t hashkey_faddr;
1336
1337 if (!locked) {
1338 if (!lck_rw_try_lock_exclusive(pcbinfo->mtx)) {
1339 /*lock inversion issue, mostly with udp multicast packets */
1340 socket_unlock(inp->inp_socket, 0);
1341 lck_rw_lock_exclusive(pcbinfo->mtx);
1342 socket_lock(inp->inp_socket, 0);
1343 }
1344 }
1345
1346 #if INET6
1347 if (inp->inp_vflag & INP_IPV6)
1348 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1349 else
1350 #endif /* INET6 */
1351 hashkey_faddr = inp->inp_faddr.s_addr;
1352
1353 inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->hashmask);
1354
1355 pcbhash = &pcbinfo->hashbase[inp->hash_element];
1356
1357 pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
1358 pcbinfo->porthashmask)];
1359
1360 /*
1361 * Go through port list and look for a head for this lport.
1362 */
1363 LIST_FOREACH(phd, pcbporthash, phd_hash) {
1364 if (phd->phd_port == inp->inp_lport)
1365 break;
1366 }
1367 /*
1368 * If none exists, malloc one and tack it on.
1369 */
1370 if (phd == NULL) {
1371 MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_WAITOK);
1372 if (phd == NULL) {
1373 if (!locked)
1374 lck_rw_done(pcbinfo->mtx);
1375 return (ENOBUFS); /* XXX */
1376 }
1377 phd->phd_port = inp->inp_lport;
1378 LIST_INIT(&phd->phd_pcblist);
1379 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1380 }
1381 inp->inp_phd = phd;
1382 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1383 LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1384 if (!locked)
1385 lck_rw_done(pcbinfo->mtx);
1386 return (0);
1387 }
1388
1389 /*
1390 * Move PCB to the proper hash bucket when { faddr, fport } have been
1391 * changed. NOTE: This does not handle the case of the lport changing (the
1392 * hashed port list would have to be updated as well), so the lport must
1393 * not change after in_pcbinshash() has been called.
1394 */
1395 void
1396 in_pcbrehash(struct inpcb *inp)
1397 {
1398 struct inpcbhead *head;
1399 u_int32_t hashkey_faddr;
1400
1401 #if INET6
1402 if (inp->inp_vflag & INP_IPV6)
1403 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1404 else
1405 #endif /* INET6 */
1406 hashkey_faddr = inp->inp_faddr.s_addr;
1407 inp->hash_element = INP_PCBHASH(hashkey_faddr, inp->inp_lport,
1408 inp->inp_fport, inp->inp_pcbinfo->hashmask);
1409 head = &inp->inp_pcbinfo->hashbase[inp->hash_element];
1410
1411 LIST_REMOVE(inp, inp_hash);
1412 LIST_INSERT_HEAD(head, inp, inp_hash);
1413 }
1414
1415 /*
1416 * Remove PCB from various lists.
1417 */
1418 //###LOCK must be called with list lock held
1419 void
1420 in_pcbremlists(struct inpcb *inp)
1421 {
1422 inp->inp_gencnt = ++inp->inp_pcbinfo->ipi_gencnt;
1423
1424 if (inp->inp_lport) {
1425 struct inpcbport *phd = inp->inp_phd;
1426
1427 LIST_REMOVE(inp, inp_hash);
1428 LIST_REMOVE(inp, inp_portlist);
1429 if (phd != NULL && (LIST_FIRST(&phd->phd_pcblist) == NULL)) {
1430 LIST_REMOVE(phd, phd_hash);
1431 FREE(phd, M_PCB);
1432 }
1433 }
1434 LIST_REMOVE(inp, inp_list);
1435 inp->inp_pcbinfo->ipi_count--;
1436 }
1437
1438 /* Mechanism used to defer the memory release of PCBs
1439 * The pcb list will contain the pcb until the ripper can clean it up if
1440 * the following conditions are met: 1) state "DEAD", 2) wantcnt is STOPUSING
1441 * 3) usecount is null
1442 * This function will be called to either mark the pcb as
1443 */
1444 int
1445 in_pcb_checkstate(struct inpcb *pcb, int mode, int locked)
1446 {
1447
1448 volatile UInt32 *wantcnt = (volatile UInt32 *)&pcb->inp_wantcnt;
1449 UInt32 origwant;
1450 UInt32 newwant;
1451
1452 switch (mode) {
1453
1454 case WNT_STOPUSING: /* try to mark the pcb as ready for recycling */
1455
1456 /* compareswap with STOPUSING, if success we're good, if it's in use, will be marked later */
1457
1458 if (locked == 0)
1459 socket_lock(pcb->inp_socket, 1);
1460 pcb->inp_state = INPCB_STATE_DEAD;
1461 stopusing:
1462 if (pcb->inp_socket->so_usecount < 0)
1463 panic("in_pcb_checkstate STOP pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);
1464 if (locked == 0)
1465 socket_unlock(pcb->inp_socket, 1);
1466
1467 origwant = *wantcnt;
1468 if ((UInt16) origwant == 0xffff ) /* should stop using */
1469 return (WNT_STOPUSING);
1470 newwant = 0xffff;
1471 if ((UInt16) origwant == 0) {/* try to mark it as unsuable now */
1472 OSCompareAndSwap(origwant, newwant, wantcnt) ;
1473 }
1474 return (WNT_STOPUSING);
1475 break;
1476
1477 case WNT_ACQUIRE: /* try to increase reference to pcb */
1478 /* if WNT_STOPUSING should bail out */
1479 /*
1480 * if socket state DEAD, try to set count to STOPUSING, return failed
1481 * otherwise increase cnt
1482 */
1483 do {
1484 origwant = *wantcnt;
1485 if ((UInt16) origwant == 0xffff ) {/* should stop using */
1486 // printf("in_pcb_checkstate: ACQ PCB was STOPUSING while release. odd pcb=%p\n", pcb);
1487 return (WNT_STOPUSING);
1488 }
1489 newwant = origwant + 1;
1490 } while (!OSCompareAndSwap(origwant, newwant, wantcnt));
1491 return (WNT_ACQUIRE);
1492 break;
1493
1494 case WNT_RELEASE: /* release reference. if result is null and pcb state is DEAD,
1495 set wanted bit to STOPUSING
1496 */
1497
1498 if (locked == 0)
1499 socket_lock(pcb->inp_socket, 1);
1500
1501 do {
1502 origwant = *wantcnt;
1503 if ((UInt16) origwant == 0x0 )
1504 panic("in_pcb_checkstate pcb=%p release with zero count", pcb);
1505 if ((UInt16) origwant == 0xffff ) {/* should stop using */
1506 #if TEMPDEBUG
1507 printf("in_pcb_checkstate: REL PCB was STOPUSING while release. odd pcb=%p\n", pcb);
1508 #endif
1509 if (locked == 0)
1510 socket_unlock(pcb->inp_socket, 1);
1511 return (WNT_STOPUSING);
1512 }
1513 newwant = origwant - 1;
1514 } while (!OSCompareAndSwap(origwant, newwant, wantcnt));
1515
1516 if (pcb->inp_state == INPCB_STATE_DEAD)
1517 goto stopusing;
1518 if (pcb->inp_socket->so_usecount < 0)
1519 panic("in_pcb_checkstate RELEASE pcb=%p so=%p usecount is negative\n", pcb, pcb->inp_socket);
1520
1521 if (locked == 0)
1522 socket_unlock(pcb->inp_socket, 1);
1523 return (WNT_RELEASE);
1524 break;
1525
1526 default:
1527
1528 panic("in_pcb_checkstate: so=%p not a valid state =%x\n", pcb->inp_socket, mode);
1529 }
1530
1531 /* NOTREACHED */
1532 return (mode);
1533 }
1534
1535 /*
1536 * inpcb_to_compat copies specific bits of an inpcb to a inpcb_compat.
1537 * The inpcb_compat data structure is passed to user space and must
1538 * not change. We intentionally avoid copying pointers.
1539 */
1540 void
1541 inpcb_to_compat(
1542 struct inpcb *inp,
1543 struct inpcb_compat *inp_compat)
1544 {
1545 bzero(inp_compat, sizeof(*inp_compat));
1546 inp_compat->inp_fport = inp->inp_fport;
1547 inp_compat->inp_lport = inp->inp_lport;
1548 inp_compat->nat_owner = inp->nat_owner;
1549 inp_compat->nat_cookie = inp->nat_cookie;
1550 inp_compat->inp_gencnt = inp->inp_gencnt;
1551 inp_compat->inp_flags = inp->inp_flags;
1552 inp_compat->inp_flow = inp->inp_flow;
1553 inp_compat->inp_vflag = inp->inp_vflag;
1554 inp_compat->inp_ip_ttl = inp->inp_ip_ttl;
1555 inp_compat->inp_ip_p = inp->inp_ip_p;
1556 inp_compat->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
1557 inp_compat->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
1558 inp_compat->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
1559 inp_compat->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
1560 inp_compat->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
1561 inp_compat->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
1562 inp_compat->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
1563 }
1564
1565 #if !CONFIG_EMBEDDED
1566
1567 void
1568 inpcb_to_xinpcb64(
1569 struct inpcb *inp,
1570 struct xinpcb64 *xinp)
1571 {
1572 xinp->inp_fport = inp->inp_fport;
1573 xinp->inp_lport = inp->inp_lport;
1574 xinp->inp_gencnt = inp->inp_gencnt;
1575 xinp->inp_flags = inp->inp_flags;
1576 xinp->inp_flow = inp->inp_flow;
1577 xinp->inp_vflag = inp->inp_vflag;
1578 xinp->inp_ip_ttl = inp->inp_ip_ttl;
1579 xinp->inp_ip_p = inp->inp_ip_p;
1580 xinp->inp_dependfaddr.inp6_foreign = inp->inp_dependfaddr.inp6_foreign;
1581 xinp->inp_dependladdr.inp6_local = inp->inp_dependladdr.inp6_local;
1582 xinp->inp_depend4.inp4_ip_tos = inp->inp_depend4.inp4_ip_tos;
1583 xinp->inp_depend6.inp6_hlim = inp->inp_depend6.inp6_hlim;
1584 xinp->inp_depend6.inp6_cksum = inp->inp_depend6.inp6_cksum;
1585 xinp->inp_depend6.inp6_ifindex = inp->inp_depend6.inp6_ifindex;
1586 xinp->inp_depend6.inp6_hops = inp->inp_depend6.inp6_hops;
1587 }
1588
1589 #endif /* !CONFIG_EMBEDDED */
1590
1591 /*
1592 * The following routines implement this scheme:
1593 *
1594 * Callers of ip_output() that intend to cache the route in the inpcb pass
1595 * a local copy of the struct route to ip_output(). Using a local copy of
1596 * the cached route significantly simplifies things as IP no longer has to
1597 * worry about having exclusive access to the passed in struct route, since
1598 * it's defined in the caller's stack; in essence, this allows for a lock-
1599 * less operation when updating the struct route at the IP level and below,
1600 * whenever necessary. The scheme works as follows:
1601 *
1602 * Prior to dropping the socket's lock and calling ip_output(), the caller
1603 * copies the struct route from the inpcb into its stack, and adds a reference
1604 * to the cached route entry, if there was any. The socket's lock is then
1605 * dropped and ip_output() is called with a pointer to the copy of struct
1606 * route defined on the stack (not to the one in the inpcb.)
1607 *
1608 * Upon returning from ip_output(), the caller then acquires the socket's
1609 * lock and synchronizes the cache; if there is no route cached in the inpcb,
1610 * it copies the local copy of struct route (which may or may not contain any
1611 * route) back into the cache; otherwise, if the inpcb has a route cached in
1612 * it, the one in the local copy will be freed, if there's any. Trashing the
1613 * cached route in the inpcb can be avoided because ip_output() is single-
1614 * threaded per-PCB (i.e. multiple transmits on a PCB are always serialized
1615 * by the socket/transport layer.)
1616 */
1617 void
1618 inp_route_copyout(struct inpcb *inp, struct route *dst)
1619 {
1620 struct route *src = &inp->inp_route;
1621
1622 lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1623
1624 /* Minor sanity check */
1625 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET)
1626 panic("%s: wrong or corrupted route: %p", __func__, src);
1627
1628 /* Copy everything (rt, dst, flags) from PCB */
1629 bcopy(src, dst, sizeof (*dst));
1630
1631 /* Hold one reference for the local copy of struct route */
1632 if (dst->ro_rt != NULL)
1633 RT_ADDREF(dst->ro_rt);
1634 }
1635
1636 void
1637 inp_route_copyin(struct inpcb *inp, struct route *src)
1638 {
1639 struct route *dst = &inp->inp_route;
1640
1641 lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED);
1642
1643 /* Minor sanity check */
1644 if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET)
1645 panic("%s: wrong or corrupted route: %p", __func__, src);
1646
1647 /* No cached route in the PCB? */
1648 if (dst->ro_rt == NULL) {
1649 /*
1650 * Copy everything (rt, dst, flags) from ip_output();
1651 * the reference to the route was held at the time
1652 * it was allocated and is kept intact.
1653 */
1654 bcopy(src, dst, sizeof (*dst));
1655 } else if (src->ro_rt != NULL) {
1656 /*
1657 * If the same, update just the ro_flags and ditch the one
1658 * in the local copy. Else ditch the one that is currently
1659 * cached, and cache what we got back from ip_output().
1660 */
1661 if (dst->ro_rt == src->ro_rt) {
1662 dst->ro_flags = src->ro_flags;
1663 rtfree(src->ro_rt);
1664 src->ro_rt = NULL;
1665 } else {
1666 rtfree(dst->ro_rt);
1667 bcopy(src, dst, sizeof (*dst));
1668 }
1669 }
1670 }