1 /*
2 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22 /* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
23 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
24 /*
25 * Copyright (c) 1982, 1986, 1988, 1990, 1993
26 * The Regents of the University of California. All rights reserved.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 * 1. Redistributions of source code must retain the above copyright
32 * notice, this list of conditions and the following disclaimer.
33 * 2. Redistributions in binary form must reproduce the above copyright
34 * notice, this list of conditions and the following disclaimer in the
35 * documentation and/or other materials provided with the distribution.
36 * 3. All advertising materials mentioning features or use of this software
37 * must display the following acknowledgement:
38 * This product includes software developed by the University of
39 * California, Berkeley and its contributors.
40 * 4. Neither the name of the University nor the names of its contributors
41 * may be used to endorse or promote products derived from this software
42 * without specific prior written permission.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
45 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
46 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
47 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
48 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
49 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
50 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
51 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
52 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
53 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 *
56 * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
57 */
58
59 #include <sys/param.h>
60 #include <sys/systm.h>
61 #include <sys/proc.h>
62 #include <sys/fcntl.h>
63 #include <sys/malloc.h>
64 #include <sys/mbuf.h>
65 #include <sys/domain.h>
66 #include <sys/kernel.h>
67 #include <sys/poll.h>
68 #include <sys/protosw.h>
69 #include <sys/socket.h>
70 #include <sys/socketvar.h>
71 #include <sys/resourcevar.h>
72 #include <sys/signalvar.h>
73 #include <sys/sysctl.h>
74 #include <sys/uio.h>
75 #include <sys/ev.h>
76 #include <sys/kdebug.h>
77 #include <net/route.h>
78 #include <netinet/in.h>
79 #include <netinet/in_pcb.h>
80 #include <kern/zalloc.h>
81 #include <machine/limits.h>
82
83 int so_cache_hw = 0;
84 int so_cache_timeouts = 0;
85 int so_cache_max_freed = 0;
86 int cached_sock_count = 0;
87 struct socket *socket_cache_head = 0;
88 struct socket *socket_cache_tail = 0;
89 u_long so_cache_time = 0;
90 int so_cache_init_done = 0;
91 struct zone *so_cache_zone;
92 extern int get_inpcb_str_size();
93 extern int get_tcp_str_size();
96
97 int socket_debug = 0;
98 int socket_zone = M_SOCKET;
99 so_gen_t so_gencnt; /* generation count for sockets */
100
101 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
102 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
103
104 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
105 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
106 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
107 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
108 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
109 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
110 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
111
112
113 SYSCTL_DECL(_kern_ipc);
114
115 static int somaxconn = SOMAXCONN;
116 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
117 0, "");
118
119 /* Should we get a maximum also ??? */
120 static int sosendmaxchain = 65536;
121 static int sosendminchain = 16384;
122 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
123 0, "");
124
125 void so_cache_timer();
126
127 /*
128 * Socket operation routines.
129 * These routines are called by the routines in
130 * sys_socket.c or from a system process, and
131 * implement the semantics of socket operations by
132 * switching out to the protocol specific routines.
133 */
134
135 void socketinit()
136 {
137 vm_size_t str_size;
138
139 so_cache_init_done = 1;
140
141 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
142 str_size = (vm_size_t)( sizeof(struct socket) + 4 +
143 get_inpcb_str_size() + 4 +
144 get_tcp_str_size());
145 so_cache_zone = zinit (str_size, 120000*str_size, 8192, "socache zone");
146 #if TEMPDEBUG
147 kprintf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
148 #endif
149
150 }
151
152 void cached_sock_alloc(so, waitok)
153 struct socket **so;
154 int waitok;
155
156 {
157 caddr_t temp;
158 int s;
159 register u_long offset;
160
161
162 s = splnet();
163 if (cached_sock_count) {
164 cached_sock_count--;
165 *so = socket_cache_head;
166 if (*so == 0)
167 panic("cached_sock_alloc: cached sock is null");
168
169 socket_cache_head = socket_cache_head->cache_next;
170 if (socket_cache_head)
171 socket_cache_head->cache_prev = 0;
172 else
173 socket_cache_tail = 0;
174 splx(s);
175
176 temp = (*so)->so_saved_pcb;
177 bzero((caddr_t)*so, sizeof(struct socket));
178 #if TEMPDEBUG
   179 	kprintf("cached_sock_alloc - retrieving cached sock %x - count == %d\n", *so,
180 cached_sock_count);
181 #endif
182 (*so)->so_saved_pcb = temp;
183 }
184 else {
185 #if TEMPDEBUG
186 kprintf("Allocating cached sock %x from memory\n", *so);
187 #endif
188
189 splx(s);
190 if (waitok)
191 *so = (struct socket *) zalloc(so_cache_zone);
192 else
193 *so = (struct socket *) zalloc_noblock(so_cache_zone);
194
195 if (*so == 0)
196 return;
197
198 bzero((caddr_t)*so, sizeof(struct socket));
199
200 /*
201 * Define offsets for extra structures into our single block of
202 * memory. Align extra structures on longword boundaries.
203 */
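	/*
	 * For example (hypothetical address): an offset of 0x1001 fails
	 * the (offset & 0x3) test and is rounded up to 0x1004 before the
	 * saved pcb is placed there.
	 */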
204
205
206 offset = (u_long) *so;
207 offset += sizeof(struct socket);
208 if (offset & 0x3) {
209 offset += 4;
210 offset &= 0xfffffffc;
211 }
212 (*so)->so_saved_pcb = (caddr_t) offset;
213 offset += get_inpcb_str_size();
214 if (offset & 0x3) {
215 offset += 4;
216 offset &= 0xfffffffc;
217 }
218
219 ((struct inpcb *) (*so)->so_saved_pcb)->inp_saved_ppcb = (caddr_t) offset;
220 #if TEMPDEBUG
221 kprintf("Allocating cached socket - %x, pcb=%x tcpcb=%x\n", *so,
222 (*so)->so_saved_pcb,
223 ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
224 #endif
225 }
226
227 (*so)->cached_in_sock_layer = 1;
228 }
229
230
231 void cached_sock_free(so)
232 struct socket *so;
233 {
234 int s;
235
236
237 s = splnet();
238 if (++cached_sock_count > MAX_CACHED_SOCKETS) {
239 --cached_sock_count;
240 splx(s);
241 #if TEMPDEBUG
242 kprintf("Freeing overflowed cached socket %x\n", so);
243 #endif
244 zfree(so_cache_zone, (vm_offset_t) so);
245 }
246 else {
247 #if TEMPDEBUG
248 kprintf("Freeing socket %x into cache\n", so);
249 #endif
250 if (so_cache_hw < cached_sock_count)
251 so_cache_hw = cached_sock_count;
252
253 so->cache_next = socket_cache_head;
254 so->cache_prev = 0;
255 if (socket_cache_head)
256 socket_cache_head->cache_prev = so;
257 else
258 socket_cache_tail = so;
259
260 so->cache_timestamp = so_cache_time;
261 socket_cache_head = so;
262 splx(s);
263 }
264
265 #if TEMPDEBUG
266 kprintf("Freed cached sock %x into cache - count is %d\n", so, cached_sock_count);
267 #endif
268
269
270 }
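
/*
 * The cache above is kept most-recently-freed first: cached_sock_free()
 * inserts at socket_cache_head, and so_cache_timer() below evicts entries
 * that have aged past SO_CACHE_TIME_LIMIT from socket_cache_tail.
 */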
271
272
273 void so_cache_timer()
274 {
275 register struct socket *p;
276 register int s;
277 register int n_freed = 0;
278 boolean_t funnel_state;
279
280 funnel_state = thread_funnel_set(network_flock, TRUE);
281
282 ++so_cache_time;
283
284 s = splnet();
285
286 while (p = socket_cache_tail)
287 {
288 if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
289 break;
290
291 so_cache_timeouts++;
292
293 if (socket_cache_tail = p->cache_prev)
294 p->cache_prev->cache_next = 0;
295 if (--cached_sock_count == 0)
296 socket_cache_head = 0;
297
298 splx(s);
299
300 zfree(so_cache_zone, (vm_offset_t) p);
301
302 splnet();
303 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH)
304 {
305 so_cache_max_freed++;
306 break;
307 }
308 }
309 splx(s);
310
311 timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
312
313 (void) thread_funnel_set(network_flock, FALSE);
314
315 }
316
317
318 /*
319 * Get a socket structure from our zone, and initialize it.
320 * We don't implement `waitok' yet (see comments in uipc_domain.c).
321 * Note that it would probably be better to allocate socket
322 * and PCB at the same time, but I'm not convinced that all
323 * the protocols can be easily modified to do this.
324 */
325 struct socket *
326 soalloc(waitok, dom, type)
327 int waitok;
328 int dom;
329 int type;
330 {
331 struct socket *so;
332
333 if ((dom == PF_INET) && (type == SOCK_STREAM))
334 cached_sock_alloc(&so, waitok);
335 else
336 {
337 so = _MALLOC_ZONE(sizeof(*so), socket_zone, M_WAITOK);
338 if (so)
339 bzero(so, sizeof *so);
340 }
341 /* XXX race condition for reentrant kernel */
342
343 if (so) {
344 so->so_gencnt = ++so_gencnt;
345 so->so_zone = socket_zone;
346 }
347
348 return so;
349 }
350
351 int
352 socreate(dom, aso, type, proto)
353 int dom;
354 struct socket **aso;
355 register int type;
356 int proto;
357
358 {
359 struct proc *p = current_proc();
360 register struct protosw *prp;
361 struct socket *so;
362 register int error = 0;
363
364 if (proto)
365 prp = pffindproto(dom, proto, type);
366 else
367 prp = pffindtype(dom, type);
368 if (prp == 0 || prp->pr_usrreqs->pru_attach == 0)
369 return (EPROTONOSUPPORT);
370 if (prp->pr_type != type)
371 return (EPROTOTYPE);
372 so = soalloc(p != 0, dom, type);
373 if (so == 0)
374 return (ENOBUFS);
375
376 TAILQ_INIT(&so->so_incomp);
377 TAILQ_INIT(&so->so_comp);
378 so->so_type = type;
379
380 if (p != 0) {
381 if (p->p_ucred->cr_uid == 0)
382 so->so_state = SS_PRIV;
383
384 so->so_uid = p->p_ucred->cr_uid;
385 }
386
387 so->so_proto = prp;
388 so->so_rcv.sb_flags |= SB_RECV; /* XXX */
389 if (prp->pr_sfilter.tqh_first)
390 error = sfilter_init(so);
391 if (error == 0)
392 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
393
394 if (error) {
395 so->so_state |= SS_NOFDREF;
396 sofree(so);
397 return (error);
398 }
399 prp->pr_domain->dom_refs++;
400 so->so_rcv.sb_so = so->so_snd.sb_so = so;
401 TAILQ_INIT(&so->so_evlist);
402 *aso = so;
403 return (0);
404 }
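
/*
 * A minimal sketch (not part of the original source, not compiled in)
 * of how an in-kernel caller might use socreate(), sobind() and
 * solisten() together; the port number is hypothetical.
 */
#if 0
static int
example_tcp_listen(struct socket **sop)
{
	struct sockaddr_in sin;
	int error;

	error = socreate(AF_INET, sop, SOCK_STREAM, IPPROTO_TCP);
	if (error)
		return (error);
	bzero((caddr_t)&sin, sizeof (sin));
	sin.sin_len = sizeof (sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(7777);		/* hypothetical port */
	sin.sin_addr.s_addr = INADDR_ANY;
	error = sobind(*sop, (struct sockaddr *)&sin);
	if (error == 0)
		error = solisten(*sop, somaxconn);
	if (error)
		(void) soclose(*sop);
	return (error);
}
#endif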
405
406 int
407 sobind(so, nam)
408 struct socket *so;
409 struct sockaddr *nam;
410
411 {
412 struct proc *p = current_proc();
413 int error;
414 struct kextcb *kp;
415 int s = splnet();
416
417 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
418 if (error == 0) /* ??? */
419 { kp = sotokextcb(so);
420 while (kp)
421 { if (kp->e_soif && kp->e_soif->sf_sobind)
422 { error = (*kp->e_soif->sf_sobind)(so, nam, kp);
423 if (error)
424 { if (error == EJUSTRETURN)
425 break;
426 splx(s);
427 return(error);
428 }
429 }
430 kp = kp->e_next;
431 }
432 }
433 splx(s);
434 return (error);
435 }
436
437 void
438 sodealloc(so)
439 struct socket *so;
440 {
441 so->so_gencnt = ++so_gencnt;
442
443 if (so->cached_in_sock_layer == 1)
444 cached_sock_free(so);
445 else
446 _FREE_ZONE(so, sizeof(*so), so->so_zone);
447 }
448
449 int
450 solisten(so, backlog)
451 register struct socket *so;
452 int backlog;
453
454 {
455 struct kextcb *kp;
456 struct proc *p = current_proc();
457 int s, error;
458
459 s = splnet();
460 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
461 if (error) {
462 splx(s);
463 return (error);
464 }
465 if (TAILQ_EMPTY(&so->so_comp))
466 so->so_options |= SO_ACCEPTCONN;
467 if (backlog < 0 || backlog > somaxconn)
468 backlog = somaxconn;
469 so->so_qlimit = backlog;
470 kp = sotokextcb(so);
471 while (kp)
472 {
473 if (kp->e_soif && kp->e_soif->sf_solisten)
474 { error = (*kp->e_soif->sf_solisten)(so, kp);
475 if (error)
476 { if (error == EJUSTRETURN)
477 break;
478 splx(s);
479 return(error);
480 }
481 }
482 kp = kp->e_next;
483 }
484
485 splx(s);
486 return (0);
487 }
488
489
490 void
491 sofree(so)
492 register struct socket *so;
493 { int error;
494 struct kextcb *kp;
495 struct socket *head = so->so_head;
496
497 kp = sotokextcb(so);
498 while (kp)
499 { if (kp->e_soif && kp->e_soif->sf_sofree)
500 { error = (*kp->e_soif->sf_sofree)(so, kp);
501 if (error) {
502 selthreadclear(&so->so_snd.sb_sel);
503 selthreadclear(&so->so_rcv.sb_sel);
504 return; /* void fn */
505 }
506 }
507 kp = kp->e_next;
508 }
509
510 if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
511 selthreadclear(&so->so_snd.sb_sel);
512 selthreadclear(&so->so_rcv.sb_sel);
513 return;
514 }
515 if (head != NULL) {
516 if (so->so_state & SS_INCOMP) {
517 TAILQ_REMOVE(&head->so_incomp, so, so_list);
518 head->so_incqlen--;
519 } else if (so->so_state & SS_COMP) {
520 /*
521 * We must not decommission a socket that's
522 * on the accept(2) queue. If we do, then
523 * accept(2) may hang after select(2) indicated
524 * that the listening socket was ready.
525 */
526 selthreadclear(&so->so_snd.sb_sel);
527 selthreadclear(&so->so_rcv.sb_sel);
528 return;
529 } else {
530 panic("sofree: not queued");
531 }
532 head->so_qlen--;
533 so->so_state &= ~(SS_INCOMP|SS_COMP);
534 so->so_head = NULL;
535 }
536
537 selthreadclear(&so->so_snd.sb_sel);
538 sbrelease(&so->so_snd);
539 sorflush(so);
540 sfilter_term(so);
541 sodealloc(so);
542 }
543
544 /*
545 * Close a socket on last file table reference removal.
546 * Initiate disconnect if connected.
547 * Free socket when disconnect complete.
548 */
549 int
550 soclose(so)
551 register struct socket *so;
552 {
553 int s = splnet(); /* conservative */
554 int error = 0;
555 struct kextcb *kp;
556
557 #if FB31SIG
558 funsetown(so->so_pgid);
559 #endif
560 kp = sotokextcb(so);
561 while (kp)
562 { if (kp->e_soif && kp->e_soif->sf_soclose)
563 { error = (*kp->e_soif->sf_soclose)(so, kp);
564 if (error)
565 { splx(s);
566 return((error == EJUSTRETURN) ? 0 : error);
567 }
568 }
569 kp = kp->e_next;
570 }
571
572 if (so->so_options & SO_ACCEPTCONN) {
573 struct socket *sp, *sonext;
574
575 sp = TAILQ_FIRST(&so->so_incomp);
576 for (; sp != NULL; sp = sonext) {
577 sonext = TAILQ_NEXT(sp, so_list);
578 (void) soabort(sp);
579 }
580 for (sp = TAILQ_FIRST(&so->so_comp); sp != NULL; sp = sonext) {
581 sonext = TAILQ_NEXT(sp, so_list);
582 /* Dequeue from so_comp since sofree() won't do it */
583 TAILQ_REMOVE(&so->so_comp, sp, so_list);
584 so->so_qlen--;
585 sp->so_state &= ~SS_COMP;
586 sp->so_head = NULL;
587 (void) soabort(sp);
588 }
589
590 }
591 if (so->so_pcb == 0)
592 goto discard;
593 if (so->so_state & SS_ISCONNECTED) {
594 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
595 error = sodisconnect(so);
596 if (error)
597 goto drop;
598 }
599 if (so->so_options & SO_LINGER) {
600 if ((so->so_state & SS_ISDISCONNECTING) &&
601 (so->so_state & SS_NBIO))
602 goto drop;
603 while (so->so_state & SS_ISCONNECTED) {
604 error = tsleep((caddr_t)&so->so_timeo,
605 PSOCK | PCATCH, "soclos", so->so_linger);
606 if (error)
607 break;
608 }
609 }
610 }
611 drop:
612 if (so->so_pcb) {
613 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
614 if (error == 0)
615 error = error2;
616 }
617 discard:
618 if (so->so_pcb && so->so_state & SS_NOFDREF)
619 panic("soclose: NOFDREF");
620 so->so_state |= SS_NOFDREF;
621 so->so_proto->pr_domain->dom_refs--;
622 evsofree(so);
623 sofree(so);
624 splx(s);
625 return (error);
626 }
627
628 /*
629 * Must be called at splnet...
630 */
631 int
632 soabort(so)
633 struct socket *so;
634 {
635
636 return (*so->so_proto->pr_usrreqs->pru_abort)(so);
637 }
638
639 int
640 soaccept(so, nam)
641 register struct socket *so;
642 struct sockaddr **nam;
643 { int s = splnet();
644 int error;
645 struct kextcb *kp;
646
647 if ((so->so_state & SS_NOFDREF) == 0)
648 panic("soaccept: !NOFDREF");
649 so->so_state &= ~SS_NOFDREF;
650 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
651 if (error == 0)
652 { kp = sotokextcb(so);
653 while (kp) {
654 if (kp->e_soif && kp->e_soif->sf_soaccept)
655 { error = (*kp->e_soif->sf_soaccept)(so, nam, kp);
656 if (error)
657 { if (error == EJUSTRETURN)
658 break;
659 splx(s);
660 return(error);
661 }
662 }
663 kp = kp->e_next;
664 }
665 }
666
667
668 splx(s);
669 return (error);
670 }
671
672 int
673 soconnect(so, nam)
674 register struct socket *so;
675 struct sockaddr *nam;
676
677 {
678 int s;
679 int error;
680 struct proc *p = current_proc();
681 struct kextcb *kp;
682
683 if (so->so_options & SO_ACCEPTCONN)
684 return (EOPNOTSUPP);
685 s = splnet();
686 /*
687 * If protocol is connection-based, can only connect once.
688 * Otherwise, if connected, try to disconnect first.
689 * This allows user to disconnect by connecting to, e.g.,
690 * a null address.
691 */
692 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
693 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
694 (error = sodisconnect(so))))
695 error = EISCONN;
696 else {
697 error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
698 if (error == 0)
699 {
700 kp = sotokextcb(so);
701 while (kp)
702 {
703 if (kp->e_soif && kp->e_soif->sf_soconnect)
704 { error = (*kp->e_soif->sf_soconnect)(so, nam, kp);
705 if (error)
706 { if (error == EJUSTRETURN)
707 break;
708 splx(s);
709 return(error);
710 }
711 }
712 kp = kp->e_next;
713 }
714 }
715 }
716
717 splx(s);
718 return (error);
719 }
720
721 int
722 soconnect2(so1, so2)
723 register struct socket *so1;
724 struct socket *so2;
725 {
726 int s = splnet();
727 int error;
728 struct kextcb *kp;
729
730 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
731 if (error == 0)
732 { kp = sotokextcb(so1);
733 while (kp)
734 { if (kp->e_soif && kp->e_soif->sf_soconnect2)
735 { error = (*kp->e_soif->sf_soconnect2)(so1, so2, kp);
736 if (error)
737 { if (error == EJUSTRETURN)
738 break;
739 splx(s);
740 return(error);
741 }
742 }
743 kp = kp->e_next;
744 }
745 }
746 splx(s);
747 return (error);
748 }
749
750 int
751 sodisconnect(so)
752 register struct socket *so;
753 {
754 int s = splnet();
755 int error;
756 struct kextcb *kp;
757
758 if ((so->so_state & SS_ISCONNECTED) == 0) {
759 error = ENOTCONN;
760 goto bad;
761 }
762 if (so->so_state & SS_ISDISCONNECTING) {
763 error = EALREADY;
764 goto bad;
765 }
766 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
767
768 if (error == 0)
769 { kp = sotokextcb(so);
770 while (kp)
771 { if (kp->e_soif && kp->e_soif->sf_sodisconnect)
772 { error = (*kp->e_soif->sf_sodisconnect)(so, kp);
773 if (error)
774 { if (error == EJUSTRETURN)
775 break;
776 splx(s);
777 return(error);
778 }
779 }
780 kp = kp->e_next;
781 }
782 }
783
784 bad:
785 splx(s);
786 return (error);
787 }
788
789 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
790 /*
791 * Send on a socket.
792 * If send must go all at once and message is larger than
793 * send buffering, then hard error.
794 * Lock against other senders.
795 * If must go all at once and not enough room now, then
796 * inform user that this would block and do nothing.
797 * Otherwise, if nonblocking, send as much as possible.
798 * The data to be sent is described by "uio" if nonzero,
799 * otherwise by the mbuf chain "top" (which must be null
800 * if uio is not). Data provided in mbuf chain must be small
801 * enough to send all at once.
802 *
803 * Returns nonzero on error, timeout or signal; callers
804 * must check for short counts if EINTR/ERESTART are returned.
805 * Data and control buffers are freed on return.
806 * Experiment:
807 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
808 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
809 * point at the mbuf chain being constructed and go from there.
810 */
811 int
812 sosend(so, addr, uio, top, control, flags)
813 register struct socket *so;
814 struct sockaddr *addr;
815 struct uio *uio;
816 struct mbuf *top;
817 struct mbuf *control;
818 int flags;
819
820 {
821 struct mbuf **mp;
822 register struct mbuf *m, *freelist = NULL;
823 register long space, len, resid;
824 int clen = 0, error, s, dontroute, mlen, sendflags;
825 int atomic = sosendallatonce(so) || top;
826 struct proc *p = current_proc();
827 struct kextcb *kp;
828
829 if (uio)
830 resid = uio->uio_resid;
831 else
832 resid = top->m_pkthdr.len;
833
834 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START),
835 so,
836 resid,
837 so->so_snd.sb_cc,
838 so->so_snd.sb_lowat,
839 so->so_snd.sb_hiwat);
840
841 /*
842 * In theory resid should be unsigned.
843 * However, space must be signed, as it might be less than 0
844 * if we over-committed, and we must use a signed comparison
845 * of space and resid. On the other hand, a negative resid
846 * causes us to loop sending 0-length segments to the protocol.
847 *
848 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
849 * type sockets since that's an error.
850 */
851 if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) {
852 error = EINVAL;
853 goto out;
854 }
855
856 dontroute =
857 (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
858 (so->so_proto->pr_flags & PR_ATOMIC);
859 if (p)
860 p->p_stats->p_ru.ru_msgsnd++;
861 if (control)
862 clen = control->m_len;
863 #define snderr(errno) { error = errno; splx(s); goto release; }
864
865 restart:
866 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
867 if (error)
868 goto out;
869 do {
870 s = splnet();
871 if (so->so_state & SS_CANTSENDMORE)
872 snderr(EPIPE);
873 if (so->so_error) {
874 error = so->so_error;
875 so->so_error = 0;
876 splx(s);
877 goto release;
878 }
879 if ((so->so_state & SS_ISCONNECTED) == 0) {
880 /*
   881 			 * `sendto' and `sendmsg' are allowed on a connection-
882 * based socket if it supports implied connect.
883 * Return ENOTCONN if not connected and no address is
884 * supplied.
885 */
886 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
887 (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
888 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
889 !(resid == 0 && clen != 0))
890 snderr(ENOTCONN);
891 } else if (addr == 0 && !(flags&MSG_HOLD))
892 snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
893 ENOTCONN : EDESTADDRREQ);
894 }
895 space = sbspace(&so->so_snd);
896 if (flags & MSG_OOB)
897 space += 1024;
898 if ((atomic && resid > so->so_snd.sb_hiwat) ||
899 clen > so->so_snd.sb_hiwat)
900 snderr(EMSGSIZE);
901 if (space < resid + clen && uio &&
902 (atomic || space < so->so_snd.sb_lowat || space < clen)) {
903 if (so->so_state & SS_NBIO)
904 snderr(EWOULDBLOCK);
905 sbunlock(&so->so_snd);
906 error = sbwait(&so->so_snd);
907 splx(s);
908 if (error)
909 goto out;
910 goto restart;
911 }
912 splx(s);
913 mp = &top;
914 space -= clen;
915
916 do {
917 if (uio == NULL) {
918 /*
919 * Data is prepackaged in "top".
920 */
921 resid = 0;
922 if (flags & MSG_EOR)
923 top->m_flags |= M_EOR;
924 } else {
925 boolean_t dropped_funnel = FALSE;
926 int chainlength;
927 int bytes_to_copy;
928
929 bytes_to_copy = min(resid, space);
930
931 if (sosendminchain > 0) {
932 if (bytes_to_copy >= sosendminchain) {
933 dropped_funnel = TRUE;
934 (void)thread_funnel_set(network_flock, FALSE);
935 }
936 chainlength = 0;
937 } else
938 chainlength = sosendmaxchain;
939
940 do {
941
942 if (bytes_to_copy >= MINCLSIZE) {
943 if ((m = freelist) == NULL) {
944 int num_needed;
945 int hdrs_needed = 0;
946
947 if (top == 0)
948 hdrs_needed = 1;
949 num_needed = bytes_to_copy / MCLBYTES;
950
951 if ((bytes_to_copy - (num_needed * MCLBYTES)) >= MINCLSIZE)
952 num_needed++;
953
954 if ((freelist = m_getpackets(num_needed, hdrs_needed, M_WAIT)) == NULL)
955 goto getpackets_failed;
956 m = freelist;
957 }
958 freelist = m->m_next;
959 m->m_next = NULL;
960
961 mlen = MCLBYTES;
962 len = min(mlen, bytes_to_copy);
963 } else {
964 getpackets_failed:
965 if (top == 0) {
966 MGETHDR(m, M_WAIT, MT_DATA);
967 mlen = MHLEN;
968 m->m_pkthdr.len = 0;
969 m->m_pkthdr.rcvif = (struct ifnet *)0;
970 } else {
971 MGET(m, M_WAIT, MT_DATA);
972 mlen = MLEN;
973 }
974 len = min(mlen, bytes_to_copy);
975 /*
976 * For datagram protocols, leave room
977 * for protocol headers in first mbuf.
978 */
979 if (atomic && top == 0 && len < mlen)
980 MH_ALIGN(m, len);
981 }
982 chainlength += len;
983
984 space -= len;
985
986 error = uiomove(mtod(m, caddr_t), (int)len, uio);
987
988 resid = uio->uio_resid;
989
990 m->m_len = len;
991 *mp = m;
992 top->m_pkthdr.len += len;
993 if (error)
994 break;
995 mp = &m->m_next;
996 if (resid <= 0) {
997 if (flags & MSG_EOR)
998 top->m_flags |= M_EOR;
999 break;
1000 }
1001 bytes_to_copy = min(resid, space);
1002
1003 } while (space > 0 && (chainlength < sosendmaxchain || atomic || resid < MINCLSIZE));
1004
1005 if (dropped_funnel == TRUE)
1006 (void)thread_funnel_set(network_flock, TRUE);
1007 if (error)
1008 goto release;
1009 }
1010
1011 if (flags & (MSG_HOLD|MSG_SEND))
1012 { /* Enqueue for later, go away if HOLD */
1013 register struct mbuf *mb1;
1014 if (so->so_temp && (flags & MSG_FLUSH))
1015 { m_freem(so->so_temp);
1016 so->so_temp = NULL;
1017 }
1018 if (so->so_temp)
1019 so->so_tail->m_next = top;
1020 else
1021 so->so_temp = top;
1022 mb1 = top;
1023 while (mb1->m_next)
1024 mb1 = mb1->m_next;
1025 so->so_tail = mb1;
1026 if (flags&MSG_HOLD)
1027 { top = NULL;
1028 goto release;
1029 }
1030 top = so->so_temp;
1031 }
1032 if (dontroute)
1033 so->so_options |= SO_DONTROUTE;
1034 s = splnet(); /* XXX */
1035 kp = sotokextcb(so);
1036 /* Compute flags here, for pru_send and NKEs */
1037 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1038 /*
1039 * If the user set MSG_EOF, the protocol
1040 * understands this flag and nothing left to
1041 * send then use PRU_SEND_EOF instead of PRU_SEND.
1042 */
1043 ((flags & MSG_EOF) &&
1044 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1045 (resid <= 0)) ?
1046 PRUS_EOF :
1047 /* If there is more to send set PRUS_MORETOCOME */
1048 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1049 while (kp)
1050 { if (kp->e_soif && kp->e_soif->sf_sosend)
1051 { error = (*kp->e_soif->sf_sosend)(so, &addr,
1052 &uio, &top,
1053 &control,
1054 &sendflags,
1055 kp);
1056 if (error)
1057 { splx(s);
1058 if (error == EJUSTRETURN)
1059 { sbunlock(&so->so_snd);
1060
1061 if (freelist)
1062 m_freem_list(freelist);
1063 return(0);
1064 }
1065 goto release;
1066 }
1067 }
1068 kp = kp->e_next;
1069 }
1070
1071 error = (*so->so_proto->pr_usrreqs->pru_send)(so,
1072 sendflags, top, addr, control, p);
1073 splx(s);
1074 if (flags & MSG_SEND)
1075 so->so_temp = NULL;
1076
1077 if (dontroute)
1078 so->so_options &= ~SO_DONTROUTE;
1079 clen = 0;
1080 control = 0;
1081 top = 0;
1082 mp = &top;
1083 if (error)
1084 goto release;
1085 } while (resid && space > 0);
1086 } while (resid);
1087
1088 release:
1089 sbunlock(&so->so_snd);
1090 out:
1091 if (top)
1092 m_freem(top);
1093 if (control)
1094 m_freem(control);
1095 if (freelist)
1096 m_freem_list(freelist);
1097
1098 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END,
1099 so,
1100 resid,
1101 so->so_snd.sb_cc,
1102 space,
1103 error);
1104
1105 return (error);
1106 }
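
/*
 * A minimal sketch (not part of the original source, not compiled in)
 * of driving sosend() from kernel space with a uio describing a single
 * kernel buffer.
 */
#if 0
static int
example_sosend(struct socket *so, caddr_t buf, int buflen)
{
	struct iovec iov;
	struct uio auio;

	iov.iov_base = buf;
	iov.iov_len = buflen;
	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_procp = current_proc();

	return (sosend(so, (struct sockaddr *)0, &auio,
	    (struct mbuf *)0, (struct mbuf *)0, 0));
}
#endif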
1107
1108 /*
1109 * Implement receive operations on a socket.
1110 * We depend on the way that records are added to the sockbuf
1111 * by sbappend*. In particular, each record (mbufs linked through m_next)
1112 * must begin with an address if the protocol so specifies,
1113 * followed by an optional mbuf or mbufs containing ancillary data,
1114 * and then zero or more mbufs of data.
1115 * In order to avoid blocking network interrupts for the entire time here,
1116 * we splx() while doing the actual copy to user space.
1117 * Although the sockbuf is locked, new data may still be appended,
1118 * and thus we must maintain consistency of the sockbuf during that time.
1119 *
1120 * The caller may receive the data as a single mbuf chain by supplying
1121 * an mbuf **mp0 for use in returning the chain. The uio is then used
1122 * only for the count in uio_resid.
1123 */
1124 int
1125 soreceive(so, psa, uio, mp0, controlp, flagsp)
1126 register struct socket *so;
1127 struct sockaddr **psa;
1128 struct uio *uio;
1129 struct mbuf **mp0;
1130 struct mbuf **controlp;
1131 int *flagsp;
1132 {
1133 register struct mbuf *m, **mp;
1134 register struct mbuf *free_list, *ml;
1135 register int flags, len, error, s, offset;
1136 struct protosw *pr = so->so_proto;
1137 struct mbuf *nextrecord;
1138 int moff, type = 0;
1139 int orig_resid = uio->uio_resid;
1140 struct kextcb *kp;
1141
1142 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START,
1143 so,
1144 uio->uio_resid,
1145 so->so_rcv.sb_cc,
1146 so->so_rcv.sb_lowat,
1147 so->so_rcv.sb_hiwat);
1148
1149 kp = sotokextcb(so);
1150 while (kp)
1151 { if (kp->e_soif && kp->e_soif->sf_soreceive)
1152 { error = (*kp->e_soif->sf_soreceive)(so, psa, &uio,
1153 mp0, controlp,
1154 flagsp, kp);
1155 if (error)
1156 return((error == EJUSTRETURN) ? 0 : error);
1157 }
1158 kp = kp->e_next;
1159 }
1160
1161 mp = mp0;
1162 if (psa)
1163 *psa = 0;
1164 if (controlp)
1165 *controlp = 0;
1166 if (flagsp)
1167 flags = *flagsp &~ MSG_EOR;
1168 else
1169 flags = 0;
1170 /*
1171 * When SO_WANTOOBFLAG is set we try to get out-of-band data
  1172 	 * regardless of the flags argument. Here is the case where
1173 * out-of-band data is not inline.
1174 */
1175 if ((flags & MSG_OOB) ||
1176 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1177 (so->so_options & SO_OOBINLINE) == 0 &&
1178 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1179 m = m_get(M_WAIT, MT_DATA);
1180 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
1181 if (error)
1182 goto bad;
1183 do {
1184 error = uiomove(mtod(m, caddr_t),
1185 (int) min(uio->uio_resid, m->m_len), uio);
1186 m = m_free(m);
1187 } while (uio->uio_resid && error == 0 && m);
1188 bad:
1189 if (m)
1190 m_freem(m);
1191 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
1192 if (error == EWOULDBLOCK || error == EINVAL) {
1193 /*
1194 * Let's try to get normal data:
  1195 			 * EWOULDBLOCK: out-of-band data not received yet;
1196 * EINVAL: out-of-band data already read.
1197 */
1198 error = 0;
1199 goto nooob;
1200 } else if (error == 0 && flagsp)
1201 *flagsp |= MSG_OOB;
1202 }
1203 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1204 return (error);
1205 }
1206 nooob:
1207 if (mp)
1208 *mp = (struct mbuf *)0;
1209 if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
1210 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
1211
1212 restart:
1213 if (error = sblock(&so->so_rcv, SBLOCKWAIT(flags)))
1214 {
1215 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1216 return (error);
1217 }
1218 s = splnet();
1219
1220 m = so->so_rcv.sb_mb;
1221 /*
1222 * If we have less data than requested, block awaiting more
1223 * (subject to any timeout) if:
1224 * 1. the current count is less than the low water mark, or
1225 * 2. MSG_WAITALL is set, and it is possible to do the entire
1226 * receive operation at once if we block (resid <= hiwat).
1227 * 3. MSG_DONTWAIT is not set
1228 * If MSG_WAITALL is set but resid is larger than the receive buffer,
1229 * we have to do the receive in sections, and thus risk returning
1230 * a short count if a timeout or signal occurs after we start.
1231 */
1232 if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
1233 so->so_rcv.sb_cc < uio->uio_resid) &&
1234 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
1235 ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
1236 m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
1237 KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1"));
1238 if (so->so_error) {
1239 if (m)
1240 goto dontblock;
1241 error = so->so_error;
1242 if ((flags & MSG_PEEK) == 0)
1243 so->so_error = 0;
1244 goto release;
1245 }
1246 if (so->so_state & SS_CANTRCVMORE) {
1247 if (m)
1248 goto dontblock;
1249 else
1250 goto release;
1251 }
1252 for (; m; m = m->m_next)
1253 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1254 m = so->so_rcv.sb_mb;
1255 goto dontblock;
1256 }
1257 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
1258 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
1259 error = ENOTCONN;
1260 goto release;
1261 }
1262 if (uio->uio_resid == 0)
1263 goto release;
1264 if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) {
1265 error = EWOULDBLOCK;
1266 goto release;
1267 }
1268 sbunlock(&so->so_rcv);
1269 if (socket_debug)
1270 printf("Waiting for socket data\n");
1271 error = sbwait(&so->so_rcv);
1272 if (socket_debug)
1273 printf("SORECEIVE - sbwait returned %d\n", error);
1274 splx(s);
1275 if (error)
1276 {
1277 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,0,0,0,0);
1278 return (error);
1279 }
1280 goto restart;
1281 }
1282 dontblock:
1283 #ifdef notyet /* XXXX */
1284 if (uio->uio_procp)
1285 uio->uio_procp->p_stats->p_ru.ru_msgrcv++;
1286 #endif
1287 nextrecord = m->m_nextpkt;
1288 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
1289 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
1290 orig_resid = 0;
1291 if (psa)
1292 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
1293 mp0 == 0);
1294 if (flags & MSG_PEEK) {
1295 m = m->m_next;
1296 } else {
1297 sbfree(&so->so_rcv, m);
1298 MFREE(m, so->so_rcv.sb_mb);
1299 m = so->so_rcv.sb_mb;
1300 }
1301 }
1302 while (m && m->m_type == MT_CONTROL && error == 0) {
1303 if (flags & MSG_PEEK) {
1304 if (controlp)
1305 *controlp = m_copy(m, 0, m->m_len);
1306 m = m->m_next;
1307 } else {
1308 sbfree(&so->so_rcv, m);
1309 if (controlp) {
1310 if (pr->pr_domain->dom_externalize &&
1311 mtod(m, struct cmsghdr *)->cmsg_type ==
1312 SCM_RIGHTS)
1313 error = (*pr->pr_domain->dom_externalize)(m);
1314 *controlp = m;
1315 so->so_rcv.sb_mb = m->m_next;
1316 m->m_next = 0;
1317 m = so->so_rcv.sb_mb;
1318 } else {
1319 MFREE(m, so->so_rcv.sb_mb);
1320 m = so->so_rcv.sb_mb;
1321 }
1322 }
1323 if (controlp) {
1324 orig_resid = 0;
1325 controlp = &(*controlp)->m_next;
1326 }
1327 }
1328 if (m) {
1329 if ((flags & MSG_PEEK) == 0)
1330 m->m_nextpkt = nextrecord;
1331 type = m->m_type;
1332 if (type == MT_OOBDATA)
1333 flags |= MSG_OOB;
1334 }
1335 moff = 0;
1336 offset = 0;
1337
1338 free_list = m;
1339 ml = (struct mbuf *)0;
1340
1341 while (m && uio->uio_resid > 0 && error == 0) {
1342 if (m->m_type == MT_OOBDATA) {
1343 if (type != MT_OOBDATA)
1344 break;
1345 } else if (type == MT_OOBDATA)
1346 break;
1347 #if 0
1348 /*
  1349  * This assertion needs rework. The trouble is AppleTalk uses many
1350 * mbuf types (NOT listed in mbuf.h!) which will trigger this panic.
1351 * For now just remove the assertion... CSM 9/98
1352 */
1353 else
1354 KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
1355 ("receive 3"));
1356 #endif
1357 /*
  1358 		 * Make sure to always set MSG_OOB event when getting
1359 * out of band data inline.
1360 */
1361 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
1362 (so->so_options & SO_OOBINLINE) != 0 &&
1363 (so->so_state & SS_RCVATMARK) != 0) {
1364 flags |= MSG_OOB;
1365 }
1366 so->so_state &= ~SS_RCVATMARK;
1367 len = uio->uio_resid;
1368 if (so->so_oobmark && len > so->so_oobmark - offset)
1369 len = so->so_oobmark - offset;
1370 if (len > m->m_len - moff)
1371 len = m->m_len - moff;
1372 /*
1373 * If mp is set, just pass back the mbufs.
1374 * Otherwise copy them out via the uio, then free.
1375 * Sockbuf must be consistent here (points to current mbuf,
1376 * it points to next record) when we drop priority;
1377 * we must note any additions to the sockbuf when we
1378 * block interrupts again.
1379 */
1380 if (mp == 0) {
1381 splx(s);
1382 error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
1383 s = splnet();
1384 if (error)
1385 goto release;
1386 } else
1387 uio->uio_resid -= len;
1388 if (len == m->m_len - moff) {
1389 if (m->m_flags & M_EOR)
1390 flags |= MSG_EOR;
1391 if (flags & MSG_PEEK) {
1392 m = m->m_next;
1393 moff = 0;
1394 } else {
1395 nextrecord = m->m_nextpkt;
1396 sbfree(&so->so_rcv, m);
1397 if (mp) {
1398 *mp = m;
1399 mp = &m->m_next;
1400 so->so_rcv.sb_mb = m = m->m_next;
1401 *mp = (struct mbuf *)0;
1402 } else {
1403 m->m_nextpkt = 0;
1404 if (ml != 0)
1405 ml->m_next = m;
1406 ml = m;
1407 so->so_rcv.sb_mb = m = m->m_next;
1408 ml->m_next = 0;
1409 }
1410 if (m)
1411 m->m_nextpkt = nextrecord;
1412 }
1413 } else {
1414 if (flags & MSG_PEEK)
1415 moff += len;
1416 else {
1417 if (mp)
1418 *mp = m_copym(m, 0, len, M_WAIT);
1419 m->m_data += len;
1420 m->m_len -= len;
1421 so->so_rcv.sb_cc -= len;
1422 }
1423 }
1424 if (so->so_oobmark) {
1425 if ((flags & MSG_PEEK) == 0) {
1426 so->so_oobmark -= len;
1427 if (so->so_oobmark == 0) {
1428 so->so_state |= SS_RCVATMARK;
1429 postevent(so, 0, EV_OOB);
1430 break;
1431 }
1432 } else {
1433 offset += len;
1434 if (offset == so->so_oobmark)
1435 break;
1436 }
1437 }
1438 if (flags & MSG_EOR)
1439 break;
1440 /*
1441 * If the MSG_WAITALL flag is set (for non-atomic socket),
1442 * we must not quit until "uio->uio_resid == 0" or an error
1443 * termination. If a signal/timeout occurs, return
1444 * with a short count but without error.
1445 * Keep sockbuf locked against other readers.
1446 */
1447 while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 &&
1448 !sosendallatonce(so) && !nextrecord) {
1449 if (so->so_error || so->so_state & SS_CANTRCVMORE)
1450 break;
1451
1452 if (ml) {
1453 m_freem_list(free_list);
1454 }
1455 error = sbwait(&so->so_rcv);
1456 if (error) {
1457 sbunlock(&so->so_rcv);
1458 splx(s);
1459 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, 0,0,0,0,0);
1460 return (0);
1461 }
1462 m = so->so_rcv.sb_mb;
1463 if (m) {
1464 nextrecord = m->m_nextpkt;
1465 free_list = m;
1466 }
1467 ml = (struct mbuf *)0;
1468 }
1469 }
1470 if (ml) {
1471 m_freem_list(free_list);
1472 }
1473
1474 if (m && pr->pr_flags & PR_ATOMIC) {
1475 if (so->so_options & SO_DONTTRUNC)
1476 flags |= MSG_RCVMORE;
1477 else
1478 { flags |= MSG_TRUNC;
1479 if ((flags & MSG_PEEK) == 0)
1480 (void) sbdroprecord(&so->so_rcv);
1481 }
1482 }
1483 if ((flags & MSG_PEEK) == 0) {
1484 if (m == 0)
1485 so->so_rcv.sb_mb = nextrecord;
1486 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
1487 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
1488 }
1489 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
1490 flags |= MSG_HAVEMORE;
1491 if (orig_resid == uio->uio_resid && orig_resid &&
1492 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
1493 sbunlock(&so->so_rcv);
1494 splx(s);
1495 goto restart;
1496 }
1497
1498 if (flagsp)
1499 *flagsp |= flags;
1500 release:
1501 sbunlock(&so->so_rcv);
1502 splx(s);
1503
1504 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
1505 so,
1506 uio->uio_resid,
1507 so->so_rcv.sb_cc,
1508 0,
1509 error);
1510
1511 return (error);
1512 }
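
/*
 * A minimal sketch (not part of the original source, not compiled in):
 * receiving up to buflen bytes into a kernel buffer. Passing a non-null
 * mp0 instead would hand back the record as an mbuf chain, with the uio
 * used only for its resid count, as described above.
 */
#if 0
static int
example_soreceive(struct socket *so, caddr_t buf, int buflen)
{
	struct iovec iov;
	struct uio auio;
	int flags = 0;

	iov.iov_base = buf;
	iov.iov_len = buflen;
	auio.uio_iov = &iov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = buflen;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_procp = current_proc();

	return (soreceive(so, (struct sockaddr **)0, &auio,
	    (struct mbuf **)0, (struct mbuf **)0, &flags));
}
#endif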
1513
1514 int
1515 soshutdown(so, how)
1516 register struct socket *so;
1517 register int how;
1518 {
1519 register struct protosw *pr = so->so_proto;
1520 struct kextcb *kp;
1521 int ret;
1522
1523
1524 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, 0,0,0,0,0);
1525 kp = sotokextcb(so);
1526 while (kp)
1527 { if (kp->e_soif && kp->e_soif->sf_soshutdown)
1528 { ret = (*kp->e_soif->sf_soshutdown)(so, how, kp);
1529 if (ret)
1530 return((ret == EJUSTRETURN) ? 0 : ret);
1531 }
1532 kp = kp->e_next;
1533 }
1534
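	/*
	 * shutdown(2) passes how as 0 (no more receives), 1 (no more
	 * sends) or 2 (both); incrementing it maps these onto FREAD (1),
	 * FWRITE (2) and FREAD|FWRITE (3) for the tests below.
	 */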
1535 how++;
1536 if (how & FREAD) {
1537 sorflush(so);
1538 postevent(so, 0, EV_RCLOSED);
1539 }
1540 if (how & FWRITE) {
1541 ret = ((*pr->pr_usrreqs->pru_shutdown)(so));
1542 postevent(so, 0, EV_WCLOSED);
1543 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0);
1544 return(ret);
1545 }
1546
1547 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0,0,0,0,0);
1548 return (0);
1549 }
1550
1551 void
1552 sorflush(so)
1553 register struct socket *so;
1554 {
1555 register struct sockbuf *sb = &so->so_rcv;
1556 register struct protosw *pr = so->so_proto;
1557 register int s, error;
1558 struct sockbuf asb;
1559 struct kextcb *kp;
1560
1561 kp = sotokextcb(so);
1562 while (kp)
1563 { if (kp->e_soif && kp->e_soif->sf_sorflush)
1564 { if ((*kp->e_soif->sf_sorflush)(so, kp))
1565 return;
1566 }
1567 kp = kp->e_next;
1568 }
1569
1570 sb->sb_flags |= SB_NOINTR;
1571 (void) sblock(sb, M_WAIT);
1572 s = splimp();
1573 socantrcvmore(so);
1574 sbunlock(sb);
1575 selthreadclear(&sb->sb_sel);
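	/*
	 * Snapshot the sockbuf into asb and zero the original while still
	 * at splimp; the mbufs are then disposed of and released from the
	 * private copy without racing new appends.
	 */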
1576 asb = *sb;
1577 bzero((caddr_t)sb, sizeof (*sb));
1578 splx(s);
1579 if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
1580 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
1581 sbrelease(&asb);
1582 }
1583
1584 /*
1585 * Perhaps this routine, and sooptcopyout(), below, ought to come in
1586 * an additional variant to handle the case where the option value needs
1587 * to be some kind of integer, but not a specific size.
1588 * In addition to their use here, these functions are also called by the
1589 * protocol-level pr_ctloutput() routines.
1590 */
1591 int
1592 sooptcopyin(sopt, buf, len, minlen)
1593 struct sockopt *sopt;
1594 void *buf;
1595 size_t len;
1596 size_t minlen;
1597 {
1598 size_t valsize;
1599
1600 /*
1601 * If the user gives us more than we wanted, we ignore it,
1602 * but if we don't get the minimum length the caller
1603 * wants, we return EINVAL. On success, sopt->sopt_valsize
1604 * is set to however much we actually retrieved.
1605 */
1606 if ((valsize = sopt->sopt_valsize) < minlen)
1607 return EINVAL;
1608 if (valsize > len)
1609 sopt->sopt_valsize = valsize = len;
1610
1611 if (sopt->sopt_p != 0)
1612 return (copyin(sopt->sopt_val, buf, valsize));
1613
1614 bcopy(sopt->sopt_val, buf, valsize);
1615 return 0;
1616 }
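
/*
 * For example, with len = minlen = sizeof (int): a 2-byte option value
 * fails with EINVAL, while an 8-byte value is truncated to 4 bytes and
 * sopt->sopt_valsize is rewritten to 4 to say how much was used.
 */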
1617
1618 int
1619 sosetopt(so, sopt)
1620 struct socket *so;
1621 struct sockopt *sopt;
1622 {
1623 int error, optval;
1624 struct linger l;
1625 struct timeval tv;
1626 short val;
1627 struct kextcb *kp;
1628
1629 kp = sotokextcb(so);
1630 while (kp)
1631 { if (kp->e_soif && kp->e_soif->sf_socontrol)
1632 { error = (*kp->e_soif->sf_socontrol)(so, sopt, kp);
1633 if (error)
1634 return((error == EJUSTRETURN) ? 0 : error);
1635 }
1636 kp = kp->e_next;
1637 }
1638
1639 error = 0;
1640 if (sopt->sopt_level != SOL_SOCKET) {
1641 if (so->so_proto && so->so_proto->pr_ctloutput)
1642 return ((*so->so_proto->pr_ctloutput)
1643 (so, sopt));
1644 error = ENOPROTOOPT;
1645 } else {
1646 switch (sopt->sopt_name) {
1647 case SO_LINGER:
1648 error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
1649 if (error)
1650 goto bad;
1651
1652 so->so_linger = l.l_linger;
1653 if (l.l_onoff)
1654 so->so_options |= SO_LINGER;
1655 else
1656 so->so_options &= ~SO_LINGER;
1657 break;
1658
1659 case SO_DEBUG:
1660 case SO_KEEPALIVE:
1661 case SO_DONTROUTE:
1662 case SO_USELOOPBACK:
1663 case SO_BROADCAST:
1664 case SO_REUSEADDR:
1665 case SO_REUSEPORT:
1666 case SO_OOBINLINE:
1667 case SO_TIMESTAMP:
1668 case SO_DONTTRUNC:
1669 case SO_WANTMORE:
1670 case SO_WANTOOBFLAG:
1671 error = sooptcopyin(sopt, &optval, sizeof optval,
1672 sizeof optval);
1673 if (error)
1674 goto bad;
1675 if (optval)
1676 so->so_options |= sopt->sopt_name;
1677 else
1678 so->so_options &= ~sopt->sopt_name;
1679 break;
1680
1681 case SO_SNDBUF:
1682 case SO_RCVBUF:
1683 case SO_SNDLOWAT:
1684 case SO_RCVLOWAT:
1685 error = sooptcopyin(sopt, &optval, sizeof optval,
1686 sizeof optval);
1687 if (error)
1688 goto bad;
1689
1690 /*
1691 * Values < 1 make no sense for any of these
1692 * options, so disallow them.
1693 */
1694 if (optval < 1) {
1695 error = EINVAL;
1696 goto bad;
1697 }
1698
1699 switch (sopt->sopt_name) {
1700 case SO_SNDBUF:
1701 case SO_RCVBUF:
1702 if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
1703 &so->so_snd : &so->so_rcv,
1704 (u_long) optval) == 0) {
1705 error = ENOBUFS;
1706 goto bad;
1707 }
1708 break;
1709
1710 /*
1711 * Make sure the low-water is never greater than
1712 * the high-water.
1713 */
1714 case SO_SNDLOWAT:
1715 so->so_snd.sb_lowat =
1716 (optval > so->so_snd.sb_hiwat) ?
1717 so->so_snd.sb_hiwat : optval;
1718 break;
1719 case SO_RCVLOWAT:
1720 so->so_rcv.sb_lowat =
1721 (optval > so->so_rcv.sb_hiwat) ?
1722 so->so_rcv.sb_hiwat : optval;
1723 break;
1724 }
1725 break;
1726
1727 case SO_SNDTIMEO:
1728 case SO_RCVTIMEO:
1729 error = sooptcopyin(sopt, &tv, sizeof tv,
1730 sizeof tv);
1731 if (error)
1732 goto bad;
1733
1734 if (tv.tv_sec > SHRT_MAX / hz - hz) {
1735 error = EDOM;
1736 goto bad;
1737 }
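			/*
			 * Convert the timeval to scheduler ticks: e.g. with
			 * hz = 100 (so tick = 10000 usec), 1.5 seconds becomes
			 * 100 + 50 = 150 ticks. Actual hz/tick values are
			 * configuration-dependent.
			 */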
1738 val = tv.tv_sec * hz + tv.tv_usec / tick;
1739
1740 switch (sopt->sopt_name) {
1741 case SO_SNDTIMEO:
1742 so->so_snd.sb_timeo = val;
1743 break;
1744 case SO_RCVTIMEO:
1745 so->so_rcv.sb_timeo = val;
1746 break;
1747 }
1748 break;
1749
1750 case SO_NKE:
1751 { struct so_nke nke;
1752 struct NFDescriptor *nf1, *nf2 = NULL;
1753
1754 error = sooptcopyin(sopt, &nke,
1755 sizeof nke, sizeof nke);
1756 if (error)
1757 goto bad;
1758
1759 error = nke_insert(so, &nke);
1760 break;
1761 }
1762
1763 default:
1764 error = ENOPROTOOPT;
1765 break;
1766 }
1767 if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
1768 (void) ((*so->so_proto->pr_ctloutput)
1769 (so, sopt));
1770 }
1771 }
1772 bad:
1773 return (error);
1774 }
1775
1776 /* Helper routine for getsockopt */
1777 int
1778 sooptcopyout(sopt, buf, len)
1779 struct sockopt *sopt;
1780 void *buf;
1781 size_t len;
1782 {
1783 int error;
1784 size_t valsize;
1785
1786 error = 0;
1787
1788 /*
1789 * Documented get behavior is that we always return a value,
1790 * possibly truncated to fit in the user's buffer.
1791 * Traditional behavior is that we always tell the user
1792 * precisely how much we copied, rather than something useful
1793 * like the total amount we had available for her.
1794 * Note that this interface is not idempotent; the entire answer must
  1795 	 * be generated ahead of time.
1796 */
1797 valsize = min(len, sopt->sopt_valsize);
1798 sopt->sopt_valsize = valsize;
1799 if (sopt->sopt_val != 0) {
1800 if (sopt->sopt_p != 0)
1801 error = copyout(buf, sopt->sopt_val, valsize);
1802 else
1803 bcopy(buf, sopt->sopt_val, valsize);
1804 }
1805 return error;
1806 }
1807
1808 int
1809 sogetopt(so, sopt)
1810 struct socket *so;
1811 struct sockopt *sopt;
1812 {
1813 int error, optval;
1814 struct linger l;
1815 struct timeval tv;
1816 struct mbuf *m;
1817 struct kextcb *kp;
1818
1819 kp = sotokextcb(so);
1820 while (kp)
1821 { if (kp->e_soif && kp->e_soif->sf_socontrol)
1822 { error = (*kp->e_soif->sf_socontrol)(so, sopt, kp);
1823 if (error)
1824 return((error == EJUSTRETURN) ? 0 : error);
1825 }
1826 kp = kp->e_next;
1827 }
1828
1829 error = 0;
1830 if (sopt->sopt_level != SOL_SOCKET) {
1831 if (so->so_proto && so->so_proto->pr_ctloutput) {
1832 return ((*so->so_proto->pr_ctloutput)
1833 (so, sopt));
1834 } else
1835 return (ENOPROTOOPT);
1836 } else {
1837 switch (sopt->sopt_name) {
1838 case SO_LINGER:
1839 l.l_onoff = so->so_options & SO_LINGER;
1840 l.l_linger = so->so_linger;
1841 error = sooptcopyout(sopt, &l, sizeof l);
1842 break;
1843
1844 case SO_USELOOPBACK:
1845 case SO_DONTROUTE:
1846 case SO_DEBUG:
1847 case SO_KEEPALIVE:
1848 case SO_REUSEADDR:
1849 case SO_REUSEPORT:
1850 case SO_BROADCAST:
1851 case SO_OOBINLINE:
1852 case SO_TIMESTAMP:
1853 case SO_DONTTRUNC:
1854 case SO_WANTMORE:
1855 case SO_WANTOOBFLAG:
1856 optval = so->so_options & sopt->sopt_name;
1857 integer:
1858 error = sooptcopyout(sopt, &optval, sizeof optval);
1859 break;
1860
1861 case SO_TYPE:
1862 optval = so->so_type;
1863 goto integer;
1864
1865 case SO_NREAD:
1866 { int pkt_total;
1867 struct mbuf *m1;
1868
1869 pkt_total = 0;
1870 m1 = so->so_rcv.sb_mb;
1871 if (so->so_proto->pr_flags & PR_ATOMIC)
1872 {
1873 #if 0
1874 kprintf("SKT CC: %d\n", so->so_rcv.sb_cc);
1875 #endif
1876 while (m1)
1877 { if (m1->m_type == MT_DATA)
1878 pkt_total += m1->m_len;
1879 #if 0
1880 kprintf("CNT: %d/%d\n", m1->m_len, pkt_total);
1881 #endif
1882 m1 = m1->m_next;
1883 }
1884 optval = pkt_total;
1885 } else
1886 optval = so->so_rcv.sb_cc;
1887 #if 0
1888 kprintf("RTN: %d\n", optval);
1889 #endif
1890 goto integer;
1891 }
1892 case SO_ERROR:
1893 optval = so->so_error;
1894 so->so_error = 0;
1895 goto integer;
1896
1897 case SO_SNDBUF:
1898 optval = so->so_snd.sb_hiwat;
1899 goto integer;
1900
1901 case SO_RCVBUF:
1902 optval = so->so_rcv.sb_hiwat;
1903 goto integer;
1904
1905 case SO_SNDLOWAT:
1906 optval = so->so_snd.sb_lowat;
1907 goto integer;
1908
1909 case SO_RCVLOWAT:
1910 optval = so->so_rcv.sb_lowat;
1911 goto integer;
1912
1913 case SO_SNDTIMEO:
1914 case SO_RCVTIMEO:
1915 optval = (sopt->sopt_name == SO_SNDTIMEO ?
1916 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1917
1918 tv.tv_sec = optval / hz;
1919 tv.tv_usec = (optval % hz) * tick;
1920 error = sooptcopyout(sopt, &tv, sizeof tv);
1921 break;
1922
1923 default:
1924 error = ENOPROTOOPT;
1925 break;
1926 }
1927 return (error);
1928 }
1929 }
1930
1931 void
1932 sohasoutofband(so)
1933 register struct socket *so;
1934 {
1935 struct proc *p;
1936
1937 struct kextcb *kp;
1938
1939 kp = sotokextcb(so);
1940 while (kp)
1941 { if (kp->e_soif && kp->e_soif->sf_sohasoutofband)
1942 { if ((*kp->e_soif->sf_sohasoutofband)(so, kp))
1943 return;
1944 }
1945 kp = kp->e_next;
1946 }
1947 if (so->so_pgid < 0)
1948 gsignal(-so->so_pgid, SIGURG);
1949 else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
1950 psignal(p, SIGURG);
1951 selwakeup(&so->so_rcv.sb_sel);
1952 }
1953
1954 /*
1955 * Network filter support
1956 */
1957 /* Run the list of filters, creating extension control blocks */
  1958 int sfilter_init(register struct socket *so)
1959 { struct kextcb *kp, **kpp;
1960 struct protosw *prp;
1961 struct NFDescriptor *nfp;
1962
1963 prp = so->so_proto;
1964 nfp = prp->pr_sfilter.tqh_first; /* non-null */
1965 kpp = &so->so_ext;
1966 kp = NULL;
1967 while (nfp)
1968 { MALLOC(kp, struct kextcb *, sizeof(*kp),
1969 M_TEMP, M_WAITOK);
1970 if (kp == NULL)
1971 return(ENOBUFS); /* so_free will clean up */
1972 *kpp = kp;
1973 kpp = &kp->e_next;
1974 kp->e_next = NULL;
1975 kp->e_fcb = NULL;
1976 kp->e_nfd = nfp;
1977 kp->e_soif = nfp->nf_soif;
1978 kp->e_sout = nfp->nf_soutil;
1979 /*
1980 * Ignore return value for create
1981 * Everyone gets a chance at startup
1982 */
1983 if (kp->e_soif && kp->e_soif->sf_socreate)
1984 (*kp->e_soif->sf_socreate)(so, prp, kp);
1985 nfp = nfp->nf_next.tqe_next;
1986 }
1987 return(0);
1988 }
1989
1990
1991 /*
1992 * Run the list of filters, freeing extension control blocks
1993 * Assumes the soif/soutil blocks have been handled.
1994 */
  1995 int sfilter_term(struct socket *so)
1996 { struct kextcb *kp, *kp1;
1997
1998 kp = so->so_ext;
1999 while (kp)
2000 { kp1 = kp->e_next;
2001 /*
2002 * Ignore return code on termination; everyone must
2003 * get terminated.
2004 */
2005 if (kp->e_soif && kp->e_soif->sf_sofree)
2006 kp->e_soif->sf_sofree(so, kp);
2007 FREE(kp, M_TEMP);
2008 kp = kp1;
2009 }
2010 return(0);
2011 }
2012
2013
2014 int
2015 sopoll(struct socket *so, int events, struct ucred *cred, void * wql)
2016 {
2017 struct proc *p = current_proc();
2018 int revents = 0;
2019 int s = splnet();
2020
2021 if (events & (POLLIN | POLLRDNORM))
2022 if (soreadable(so))
2023 revents |= events & (POLLIN | POLLRDNORM);
2024
2025 if (events & (POLLOUT | POLLWRNORM))
2026 if (sowriteable(so))
2027 revents |= events & (POLLOUT | POLLWRNORM);
2028
2029 if (events & (POLLPRI | POLLRDBAND))
2030 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
2031 revents |= events & (POLLPRI | POLLRDBAND);
2032
2033 if (revents == 0) {
2034 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2035 so->so_rcv.sb_flags |= SB_SEL;
2036 selrecord(p, &so->so_rcv.sb_sel, wql);
2037 }
2038
2039 if (events & (POLLOUT | POLLWRNORM)) {
2040 so->so_snd.sb_flags |= SB_SEL;
2041 selrecord(p, &so->so_snd.sb_sel, wql);
2042 }
2043 }
2044
2045 splx(s);
2046 return (revents);
2047 }
2048
2049 /*#### IPv6 Integration. Added new routines */
2050 int
2051 sooptgetm(struct sockopt *sopt, struct mbuf **mp)
2052 {
2053 struct mbuf *m, *m_prev;
2054 int sopt_size = sopt->sopt_valsize;
2055
2056 MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
2057 if (m == 0)
2058 return ENOBUFS;
2059 if (sopt_size > MLEN) {
2060 MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
2061 if ((m->m_flags & M_EXT) == 0) {
2062 m_free(m);
2063 return ENOBUFS;
2064 }
2065 m->m_len = min(MCLBYTES, sopt_size);
2066 } else {
2067 m->m_len = min(MLEN, sopt_size);
2068 }
2069 sopt_size -= m->m_len;
2070 *mp = m;
2071 m_prev = m;
2072
2073 while (sopt_size) {
2074 MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);
2075 if (m == 0) {
2076 m_freem(*mp);
2077 return ENOBUFS;
2078 }
2079 if (sopt_size > MLEN) {
2080 MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
2081 if ((m->m_flags & M_EXT) == 0) {
2082 m_freem(*mp);
2083 return ENOBUFS;
2084 }
2085 m->m_len = min(MCLBYTES, sopt_size);
2086 } else {
2087 m->m_len = min(MLEN, sopt_size);
2088 }
2089 sopt_size -= m->m_len;
2090 m_prev->m_next = m;
2091 m_prev = m;
2092 }
2093 return 0;
2094 }
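
/*
 * A minimal sketch (not part of the original source, not compiled in)
 * pairing sooptgetm() with sooptmcopyin() below: size an mbuf chain to
 * the option value, then fill it from the caller's buffer. Note that
 * sooptmcopyin() consumes sopt_valsize and advances sopt_val.
 */
#if 0
static int
example_sockopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = sooptgetm(sopt, mp);
	if (error == 0)
		error = sooptmcopyin(sopt, *mp);	/* frees chain on error */
	return (error);
}
#endif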
2095
2096 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
2097 int
2098 sooptmcopyin(struct sockopt *sopt, struct mbuf *m)
2099 {
2100 struct mbuf *m0 = m;
2101
2102 if (sopt->sopt_val == NULL)
2103 return 0;
2104 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2105 if (sopt->sopt_p != NULL) {
2106 int error;
2107
2108 error = copyin(sopt->sopt_val, mtod(m, char *),
2109 m->m_len);
2110 if (error != 0) {
2111 m_freem(m0);
2112 return(error);
2113 }
2114 } else
2115 bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
2116 sopt->sopt_valsize -= m->m_len;
2117 (caddr_t)sopt->sopt_val += m->m_len;
2118 m = m->m_next;
2119 }
  2120 	if (m != NULL) /* ip6_sooptmcopyin() should have allocated enough */
2121 panic("sooptmcopyin");
2122 return 0;
2123 }
2124
2125 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
2126 int
2127 sooptmcopyout(struct sockopt *sopt, struct mbuf *m)
2128 {
2129 struct mbuf *m0 = m;
2130 size_t valsize = 0;
2131
2132 if (sopt->sopt_val == NULL)
2133 return 0;
2134 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
2135 if (sopt->sopt_p != NULL) {
2136 int error;
2137
2138 error = copyout(mtod(m, char *), sopt->sopt_val,
2139 m->m_len);
2140 if (error != 0) {
2141 m_freem(m0);
2142 return(error);
2143 }
2144 } else
2145 bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
2146 sopt->sopt_valsize -= m->m_len;
2147 (caddr_t)sopt->sopt_val += m->m_len;
2148 valsize += m->m_len;
2149 m = m->m_next;
2150 }
2151 if (m != NULL) {
  2152 		/* the caller should have supplied a large enough option buffer */
2153 m_freem(m0);
2154 return(EINVAL);
2155 }
2156 sopt->sopt_valsize = valsize;
2157 return 0;
2158 }
2159