/*
 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/assert.h>
#include <kern/clock.h>
#include <kern/locks.h>
#include <kern/sched_prim.h>
#include <kern/debug.h>
#include <mach/machine/thread_status.h>
#include <mach/thread_act.h>

#include <sys/kernel.h>
#include <sys/vm.h>
#include <sys/proc_internal.h>
#include <sys/syscall.h>
#include <sys/systm.h>
#include <sys/user.h>
#include <sys/errno.h>
#include <sys/kdebug.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/kauth.h>
#include <sys/systm.h>

#include <security/audit/audit.h>

#include <i386/seg.h>
#include <i386/machine_routines.h>
#include <mach/i386/syscall_sw.h>

#if CONFIG_DTRACE
extern int32_t dtrace_systrace_syscall(struct proc *, void *, int *);
extern void dtrace_systrace_syscall_return(unsigned short, int, int *);
#endif

extern void unix_syscall(x86_saved_state_t *);
extern void unix_syscall64(x86_saved_state_t *);
extern void *find_user_regs(thread_t);

extern void x86_toggle_sysenter_arg_store(thread_t thread, boolean_t valid);
extern boolean_t x86_sysenter_arg_store_isvalid(thread_t thread);

/* dynamically generated at build time based on syscalls.master */
extern const char *syscallnames[];

/*
 * Function:	unix_syscall
 *
 * Inputs:	regs	- pointer to i386 save area
 *
 * Outputs:	none
 */
void
unix_syscall(x86_saved_state_t *state)
{
	thread_t		thread;
	void			*vt;
	unsigned int		code;
	struct sysent		*callp;

	int			error;
	vm_offset_t		params;
	struct proc		*p;
	struct uthread		*uthread;
	x86_saved_state32_t	*regs;
	boolean_t		args_in_uthread;

	assert(is_saved_state32(state));
	regs = saved_state32(state);
#if DEBUG
	if (regs->eax == 0x800)
		thread_exception_return();
#endif
	thread = current_thread();
	uthread = get_bsdthread_info(thread);


	/* Get the appropriate proc; may be different from task's for vfork() */
	if (!(uthread->uu_flag & UT_VFORK))
		p = (struct proc *)get_bsdtask_info(current_task());
	else
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (p == NULL) {
		regs->eax = EPERM;
		regs->efl |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}

	code = regs->eax & I386_SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
				  code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
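	/*
	 * When entry was via the sysenter fast path, the handler may already
	 * have stored the call arguments in uthread->uu_arg and encoded their
	 * byte count in the I386_SYSCALL_ARG_BYTES_MASK bits of %eax; if that
	 * stored copy is valid, the copyin from the user stack below is skipped.
	 */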
	args_in_uthread = ((regs->eax & I386_SYSCALL_ARG_BYTES_MASK) != 0) && x86_sysenter_arg_store_isvalid(thread);
	params = (vm_offset_t) (regs->uesp + sizeof (int));

	regs->efl &= ~(EFL_CF);

	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

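	/*
	 * Out-of-range syscall numbers are clamped to sysent[63] above, which
	 * appears to be a reserved nosys()/enosys slot in syscalls.master.
	 * Entry 0 is the indirect syscall(2) gateway: the real syscall number
	 * is the first word on the user stack, so it is re-fetched from there
	 * and the argument pointer advanced past it.
	 */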
	if (callp == sysent) {
		code = fuword(params);
		params += sizeof(int);
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	}

	vt = (void *)uthread->uu_arg;

	if (callp->sy_arg_bytes != 0) {
		sy_munge_t	*mungerp;

		assert((unsigned) callp->sy_arg_bytes <= sizeof (uthread->uu_arg));
		if (!args_in_uthread)
		{
			uint32_t	nargs;
			nargs = callp->sy_arg_bytes;
			error = copyin((user_addr_t) params, (char *) vt, nargs);
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}

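		/*
		 * Syscall 180 is kdebug_trace (per syscalls.master); it is not
		 * traced here, since emitting kdebug events for the tracing
		 * syscall itself would just produce recursive trace records.
		 */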
		if (code != 180) {
			int *ip = (int *)vt;

			KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				*ip, *(ip+1), *(ip+2), *(ip+3), 0);
		}
		mungerp = callp->sy_arg_munge32;

		/*
		 * If non-NULL, call the syscall argument munger to rewrite the
		 * arguments (see xnu/bsd/dev/{i386|x86_64}/munge.s); the first
		 * argument is NULL because we munge in place after the copyin,
		 * since this ABI does not pass system call arguments in registers.
		 */
		if (mungerp != NULL)
			(*mungerp)(NULL, vt);
	} else
		KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = regs->edx;
	uthread->uu_flag |= UT_NOTCANCELPT;


#ifdef JOE_DEBUG
	uthread->uu_iocount = 0;
	uthread->uu_vpindex = 0;
#endif

	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, (void *) vt, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);
#if CONFIG_MACF
	mac_thread_userret(code, error, thread);
#endif

#ifdef JOE_DEBUG
	if (uthread->uu_iocount)
		printf("system call returned with uu_iocount != 0\n");
#endif
#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

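	/*
	 * Return disposition: ERESTART backs the user pc up so the trap is
	 * reissued, EJUSTRETURN leaves the register state untouched, and
	 * otherwise the BSD convention applies: on failure errno is returned
	 * in %eax with the carry flag set, on success the result(s) go in
	 * %eax/%edx with the carry flag clear.
	 */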
	if (error == ERESTART) {
		/*
		 * Move the user's pc back to repeat the syscall:
		 * 5 bytes for a sysenter, or 2 for an int 8x.
		 * The SYSENTER_TF_CS covers single-stepping over a sysenter
		 * - see debug trap handler in idt.s/idt64.s
		 */

		if (regs->cs == SYSENTER_CS || regs->cs == SYSENTER_TF_CS) {
			regs->eip -= 5;
		}
		else
			regs->eip -= 2;
	}
	else if (error != EJUSTRETURN) {
		if (error) {
			regs->eax = error;
			regs->efl |= EFL_CF;	/* carry bit */
		} else { /* (not error) */
			regs->eax = uthread->uu_rval[0];
			regs->edx = uthread->uu_rval[1];
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall: error=%d retval=(%u,%u)\n",
		error, regs->eax, regs->edx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;
#if DEBUG
	/*
	 * panic if we're still holding the funnel
	 */
	syscall_exit_funnelcheck();
#endif /* DEBUG */
	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	if (code != 180)
		KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);


	thread_exception_return();
	/* NOTREACHED */
}


void
unix_syscall64(x86_saved_state_t *state)
{
	thread_t		thread;
	unsigned int		code;
	struct sysent		*callp;
	void			*uargp;
	int			args_in_regs;
	int			error;
	struct proc		*p;
	struct uthread		*uthread;
	x86_saved_state64_t	*regs;

	assert(is_saved_state64(state));
	regs = saved_state64(state);

	if (regs->rax == 0x2000800)
		thread_exception_return();

	thread = current_thread();
	uthread = get_bsdthread_info(thread);

	/* Get the appropriate proc; may be different from task's for vfork() */
	if (!(uthread->uu_flag & UT_VFORK))
		p = (struct proc *)get_bsdtask_info(current_task());
	else
		p = current_proc();

	/* Verify that we are not being called from a task without a proc */
	if (p == NULL) {
		regs->rax = EPERM;
		regs->isf.rflags |= EFL_CF;
		task_terminate_internal(current_task());
		thread_exception_return();
		/* NOTREACHED */
	}
	args_in_regs = 6;

	code = regs->rax & SYSCALL_NUMBER_MASK;
	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: code=%d(%s) rip=%llx\n",
		code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
	callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
	uargp = (void *)(&regs->rdi);
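	/*
	 * The first six arguments arrive in registers; the 64-bit saved state
	 * evidently lays them out contiguously starting at rdi (with the
	 * v_arg6... spill slots following), so uargp can be handed to the
	 * handler as a flat argument array.
	 */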

	if (callp == sysent) {
		/*
		 * indirect system call... system call number
		 * passed as 'arg0'
		 */
		code = regs->rdi;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
		uargp = (void *)(&regs->rsi);
		args_in_regs = 5;
	}

	if (callp->sy_narg != 0) {
		if (code != 180) {
			uint64_t *ip = (uint64_t *)uargp;

			KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
				(int)(*ip), (int)(*(ip+1)), (int)(*(ip+2)), (int)(*(ip+3)), 0);
		}
		assert(callp->sy_narg <= 8);

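		/*
		 * Any arguments beyond the six passed in registers are fetched
		 * from the user stack; the sizeof(user_addr_t) offset presumably
		 * skips the return address at the top of that stack.
		 */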
		if (callp->sy_narg > args_in_regs) {
			int copyin_count;

			copyin_count = (callp->sy_narg - args_in_regs) * sizeof(uint64_t);

			error = copyin((user_addr_t)(regs->isf.rsp + sizeof(user_addr_t)), (char *)&regs->v_arg6, copyin_count);
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;
				thread_exception_return();
				/* NOTREACHED */
			}
		}
		/*
		 * XXX Turn 64 bit unsafe calls into nosys()
		 */
		if (callp->sy_flags & UNSAFE_64BIT) {
			callp = &sysent[63];
			goto unsafe;
		}
	} else
		KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_START,
			0, 0, 0, 0, 0);
unsafe:

	/*
	 * Delayed binding of thread credential to process credential, if we
	 * are not running with an explicitly set thread credential.
	 */
	kauth_cred_uthread_update(uthread, p);

	uthread->uu_rval[0] = 0;
	uthread->uu_rval[1] = 0;


	uthread->uu_flag |= UT_NOTCANCELPT;


	AUDIT_SYSCALL_ENTER(code, p, uthread);
	error = (*(callp->sy_call))((void *) p, uargp, &(uthread->uu_rval[0]));
	AUDIT_SYSCALL_EXIT(code, p, uthread, error);

#if CONFIG_DTRACE
	uthread->t_dtrace_errno = error;
#endif /* CONFIG_DTRACE */

	if (error == ERESTART) {
		/*
		 * all system calls come through via the syscall instruction
		 * in 64 bit mode... it's 2 bytes in length.
		 * move the user's pc back to repeat the syscall:
		 */
		regs->isf.rip -= 2;
	}
	else if (error != EJUSTRETURN) {
		if (error) {
			regs->rax = error;
			regs->isf.rflags |= EFL_CF;	/* carry bit */
		} else { /* (not error) */

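			/*
			 * uu_rval is a pair of 32-bit slots; for the 64-bit wide
			 * return types below the handler presumably fills both
			 * slots as one 64-bit value, which is why they are read
			 * back through a uint64_t pointer.
			 */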
			switch (callp->sy_return_type) {
			case _SYSCALL_RET_INT_T:
				regs->rax = uthread->uu_rval[0];
				regs->rdx = uthread->uu_rval[1];
				break;
			case _SYSCALL_RET_UINT_T:
				regs->rax = ((u_int)uthread->uu_rval[0]);
				regs->rdx = ((u_int)uthread->uu_rval[1]);
				break;
			case _SYSCALL_RET_OFF_T:
			case _SYSCALL_RET_ADDR_T:
			case _SYSCALL_RET_SIZE_T:
			case _SYSCALL_RET_SSIZE_T:
				regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
				regs->rdx = 0;
				break;
			case _SYSCALL_RET_NONE:
				break;
			default:
				panic("unix_syscall: unknown return type");
				break;
			}
			regs->isf.rflags &= ~EFL_CF;
		}
	}

	DEBUG_KPRINT_SYSCALL_UNIX(
		"unix_syscall64: error=%d retval=(%llu,%llu)\n",
		error, regs->rax, regs->rdx);

	uthread->uu_flag &= ~UT_NOTCANCELPT;

	/*
	 * panic if we're still holding the funnel
	 */
	syscall_exit_funnelcheck();

	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	if (code != 180)
		KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}


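/*
 * Completion path for system calls that return to user mode without
 * unwinding back through unix_syscall()/unix_syscall64() (presumably those
 * resumed from a continuation after blocking): it applies the same
 * ERESTART/EJUSTRETURN/errno disposition to the saved user registers for
 * whichever register format the process uses.
 */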
void
unix_syscall_return(int error)
{
	thread_t		thread;
	struct uthread		*uthread;
	struct proc		*p;
	unsigned int		code;
	vm_offset_t		params;
	struct sysent		*callp;

	thread = current_thread();
	uthread = get_bsdthread_info(thread);


	p = current_proc();

	if (proc_is64bit(p)) {
		x86_saved_state64_t	*regs;

		regs = saved_state64(find_user_regs(thread));

		/* reconstruct code for tracing before blasting rax */
		code = regs->rax & SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

		if (callp == sysent)
			/*
			 * indirect system call... system call number
			 * passed as 'arg0'
			 */
			code = regs->rdi;

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (error == ERESTART) {
			/*
			 * all system calls come through via the syscall instruction
			 * in 64 bit mode... it's 2 bytes in length.
			 * move the user's pc back to repeat the syscall:
			 */
			regs->isf.rip -= 2;
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->rax = error;
				regs->isf.rflags |= EFL_CF;	/* carry bit */
			} else { /* (not error) */

				switch (callp->sy_return_type) {
				case _SYSCALL_RET_INT_T:
					regs->rax = uthread->uu_rval[0];
					regs->rdx = uthread->uu_rval[1];
					break;
				case _SYSCALL_RET_UINT_T:
					regs->rax = ((u_int)uthread->uu_rval[0]);
					regs->rdx = ((u_int)uthread->uu_rval[1]);
					break;
				case _SYSCALL_RET_OFF_T:
				case _SYSCALL_RET_ADDR_T:
				case _SYSCALL_RET_SIZE_T:
				case _SYSCALL_RET_SSIZE_T:
					regs->rax = *((uint64_t *)(&uthread->uu_rval[0]));
					regs->rdx = 0;
					break;
				case _SYSCALL_RET_NONE:
					break;
				default:
					panic("unix_syscall: unknown return type");
					break;
				}
				regs->isf.rflags &= ~EFL_CF;
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%llu,%llu)\n",
			error, regs->rax, regs->rdx);
	} else {
		x86_saved_state32_t	*regs;

		regs = saved_state32(find_user_regs(thread));

		regs->efl &= ~(EFL_CF);
		/* reconstruct code for tracing before blasting eax */
		code = regs->eax & I386_SYSCALL_NUMBER_MASK;
		callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

#if CONFIG_DTRACE
		if (callp->sy_call == dtrace_systrace_syscall)
			dtrace_systrace_syscall_return( code, error, uthread->uu_rval );
#endif /* CONFIG_DTRACE */
		AUDIT_SYSCALL_EXIT(code, p, uthread, error);

		if (callp == sysent) {
			params = (vm_offset_t) (regs->uesp + sizeof (int));
			code = fuword(params);
		}
		if (error == ERESTART) {
			regs->eip -= ((regs->cs & 0xffff) == SYSENTER_CS) ? 5 : 2;
		}
		else if (error != EJUSTRETURN) {
			if (error) {
				regs->eax = error;
				regs->efl |= EFL_CF;	/* carry bit */
			} else { /* (not error) */
				regs->eax = uthread->uu_rval[0];
				regs->edx = uthread->uu_rval[1];
			}
		}
		DEBUG_KPRINT_SYSCALL_UNIX(
			"unix_syscall_return: error=%d retval=(%u,%u)\n",
			error, regs->eax, regs->edx);
	}


	uthread->uu_flag &= ~UT_NOTCANCELPT;

	/*
	 * panic if we're still holding the funnel
	 */
	syscall_exit_funnelcheck();

	if (uthread->uu_lowpri_window) {
		/*
		 * task is marked as a low priority I/O type
		 * and the I/O we issued while in this system call
		 * collided with normal I/O operations... we'll
		 * delay in order to mitigate the impact of this
		 * task on the normal operation of the system
		 */
		throttle_lowpri_io(TRUE);
	}
	if (code != 180)
		KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_EXCP_SC, code) | DBG_FUNC_END,
			error, uthread->uu_rval[0], uthread->uu_rval[1], p->p_pid, 0);

	thread_exception_return();
	/* NOTREACHED */
}

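/*
 * The munge_* helpers widen a packed array of 32-bit words copied in from a
 * 32-bit process into the 64-bit argument slots the handlers expect.  In the
 * name, each 'w' appears to denote a 32-bit word widened to one 64-bit slot
 * and each 'l' a 64-bit value reassembled from two adjacent 32-bit words
 * (compare bsd/dev/{i386|x86_64}/munge.s, referenced above).  The conversion
 * is done in place, so slots are filled from the highest argument index
 * downward to avoid overwriting words that have not been widened yet.
 */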
void
munge_wwwlww(
	__unused const void	*in32,
	void			*out64)
{
	uint32_t	*arg32;
	uint64_t	*arg64;

	/* we convert in place in out64 */
	arg32 = (uint32_t *) out64;
	arg64 = (uint64_t *) out64;

	arg64[5] = arg32[6];	/* wwwlwW */
	arg64[4] = arg32[5];	/* wwwlWw */
	arg32[7] = arg32[4];	/* wwwLww (hi) */
	arg32[6] = arg32[3];	/* wwwLww (lo) */
	arg64[2] = arg32[2];	/* wwWlww */
	arg64[1] = arg32[1];	/* wWwlww */
	arg64[0] = arg32[0];	/* Wwwlww */
}


void
munge_wwlwww(
	__unused const void	*in32,
	void			*out64)
{
	uint32_t	*arg32;
	uint64_t	*arg64;

	/* we convert in place in out64 */
	arg32 = (uint32_t *) out64;
	arg64 = (uint64_t *) out64;

	arg64[5] = arg32[6];	/* wwlwwW */
	arg64[4] = arg32[5];	/* wwlwWw */
	arg64[3] = arg32[4];	/* wwlWww */
	arg32[5] = arg32[3];	/* wwLwww (hi) */
	arg32[4] = arg32[2];	/* wwLwww (lo) */
	arg64[1] = arg32[1];	/* wWlwww */
	arg64[0] = arg32[0];	/* Wwlwww */
}