From c910b4d9d2451126ae3917b931cd4390c11e1d52 Mon Sep 17 00:00:00 2001
From: Apple
Date: Mon, 11 May 2009 20:47:03 +0000
Subject: [PATCH] xnu-1228.12.14.tar.gz

---
 bsd/conf/MASTER | 1 +
 bsd/conf/Makefile.i386 | 1 -
 bsd/conf/Makefile.ppc | 1 -
 bsd/dev/dtrace/dtrace.c | 117 +-
 bsd/dev/dtrace/dtrace_ptss.c | 11 +
 bsd/dev/dtrace/fasttrap.c | 3 +
 bsd/dev/dtrace/lockstat.c | 5 +
 bsd/dev/dtrace/profile_prvd.c | 8 +
 bsd/dev/dtrace/sdt.c | 1 +
 bsd/dev/dtrace/systrace.c | 8 +-
 bsd/dev/i386/sysctl.c | 70 ++
 bsd/dev/unix_startup.c | 20 +-
 bsd/hfs/hfs.h | 4 +
 bsd/hfs/hfs_encodings.c | 1 +
 bsd/hfs/hfs_link.c | 19 +
 bsd/hfs/hfs_vfsops.c | 64 +-
 bsd/hfs/hfs_vfsutils.c | 16 +
 bsd/kern/kdebug.c | 16 +-
 bsd/kern/kern_credential.c | 26 +-
 bsd/kern/kern_exec.c | 100 +-
 bsd/kern/kern_lockf.c | 78 +-
 bsd/kern/kern_mib.c | 2 -
 bsd/kern/kern_sysctl.c | 8 +-
 bsd/kern/kpi_socketfilter.c | 19 +-
 bsd/kern/mach_loader.c | 21 +-
 bsd/kern/pthread_synch.c | 6 +
 bsd/kern/uipc_mbuf.c | 1 -
 bsd/net/if.c | 87 +-
 bsd/net/if_var.h | 6 +-
 bsd/net/radix.c | 77 +-
 bsd/net/radix.h | 16 +-
 bsd/net/route.c | 1041 +++++++++++++----
 bsd/net/route.h | 32 +-
 bsd/net/rtsock.c | 162 ++-
 bsd/netinet/in.c | 11 +-
 bsd/netinet/in.h | 10 +-
 bsd/netinet/in_arp.c | 42 +-
 bsd/netinet/in_gif.c | 8 +-
 bsd/netinet/in_pcb.c | 91 +-
 bsd/netinet/in_pcb.h | 16 +-
 bsd/netinet/in_rmx.c | 49 +-
 bsd/netinet/in_var.h | 6 +-
 bsd/netinet/ip_divert.c | 6 +-
 bsd/netinet/ip_dummynet.c | 9 +-
 bsd/netinet/ip_dummynet.h | 5 +-
 bsd/netinet/ip_flow.c | 2 +
 bsd/netinet/ip_fw2.c | 4 +-
 bsd/netinet/ip_fw2.h | 1 +
 bsd/netinet/ip_icmp.c | 23 +-
 bsd/netinet/ip_input.c | 15 +-
 bsd/netinet/ip_output.c | 492 ++++++--
 bsd/netinet/ip_var.h | 19 +-
 bsd/netinet/kpi_ipfilter.c | 8 +-
 bsd/netinet/raw_ip.c | 18 +-
 bsd/netinet/tcp_input.c | 78 +-
 bsd/netinet/tcp_output.c | 18 +-
 bsd/netinet/tcp_subr.c | 42 +-
 bsd/netinet/tcp_timer.c | 11 +-
 bsd/netinet/tcp_usrreq.c | 6 +-
 bsd/netinet/tcp_var.h | 10 +-
 bsd/netinet/udp_usrreq.c | 25 +-
 bsd/netinet6/icmp6.c | 21 +-
 bsd/netinet6/in6_pcb.c | 4 +-
 bsd/netinet6/in6_rmx.c | 22 +-
 bsd/netinet6/ip6_fw.c | 4 +-
 bsd/netinet6/ip6_output.c | 3 +
 bsd/sys/disk.h | 2 +
 bsd/sys/dtrace.h | 2 +
 bsd/sys/fcntl.h | 1 +
 bsd/sys/lockf.h | 1 +
 bsd/sys/lockstat.h | 1 -
 bsd/sys/mbuf.h | 2 +
 bsd/sys/vnode.h | 1 +
 bsd/sys/vnode_internal.h | 4 +-
 bsd/vfs/kpi_vfs.c | 18 +-
 bsd/vfs/vfs_journal.c | 24 +-
 bsd/vfs/vfs_lookup.c | 16 +-
 bsd/vfs/vfs_subr.c | 7 +-
 bsd/vfs/vfs_syscalls.c | 20 +-
 bsd/vfs/vfs_vnops.c | 10 +-
 bsd/vfs/vfs_xattr.c | 24 +-
 config/Libkern.exports | 3 -
 config/Libkern.i386.exports | 3 +
 config/Libkern.ppc.exports | 4 +-
 config/Makefile | 4 +-
 config/MasterVersion | 2 +-
 config/System6.0.exports | 3 -
 config/System6.0.i386.exports | 3 +
 config/System6.0.ppc.exports | 3 +
 iokit/IOKit/IOBufferMemoryDescriptor.h | 8 +-
 iokit/IOKit/IOMemoryDescriptor.h | 16 +-
 iokit/Kernel/IOBufferMemoryDescriptor.cpp | 5 +-
 iokit/Kernel/IOCatalogue.cpp | 1 +
 iokit/Kernel/IOHibernateIO.cpp | 3 +
 iokit/Kernel/IOMemoryDescriptor.cpp | 10 +-
 iokit/Kernel/IOTimerEventSource.cpp | 6 +-
 iokit/Kernel/IOUserClient.cpp | 2 +-
 kgmacros | 184 +++
 libkern/Makefile | 3 +-
 libkern/c++/OSMetaClass.cpp | 1 +
 libkern/kmod/Makefile.kmod | 64 +-
 libsa/catalogue.cpp | 24 -
 libsyscall/BSDmakefile | 2 +-
 libsyscall/Makefile | 12 +-
 libsyscall/Makefile.xbs | 6 +-
 libsyscall/mach/Makefile.inc | 6 +-
 makedefs/MakeInc.def | 40 +-
 makedefs/MakeInc.rule | 38 +-
 osfmk/conf/files.i386 | 4 +
 osfmk/i386/AT386/model_dep.c | 8 +-
 osfmk/i386/commpage/bcopy_sse3x.s | 2 +-
osfmk/i386/commpage/bcopy_sse3x_64.s | 6 +- osfmk/i386/commpage/bcopy_sse42.s | 313 +++++ osfmk/i386/commpage/bcopy_sse42_64.s | 304 +++++ osfmk/i386/commpage/bzero_sse2.s | 2 +- osfmk/i386/commpage/bzero_sse2_64.s | 2 +- osfmk/i386/commpage/bzero_sse42.s | 153 +++ osfmk/i386/commpage/bzero_sse42_64.s | 149 +++ osfmk/i386/commpage/commpage_asm.s | 4 + .../commpage/commpage_mach_absolute_time.s | 4 + osfmk/i386/cpu_data.h | 10 +- osfmk/i386/cpu_threads.c | 180 ++- osfmk/i386/cpu_topology.c | 2 + osfmk/i386/cpu_topology.h | 3 + osfmk/i386/cpuid.c | 23 + osfmk/i386/cpuid.h | 8 + osfmk/i386/etimer.c | 67 +- osfmk/i386/i386_lock.s | 2 + osfmk/i386/i386_vm_init.c | 16 +- osfmk/i386/lapic.c | 89 +- osfmk/i386/lapic.h | 5 + osfmk/i386/machine_check.c | 123 +- osfmk/i386/machine_check.h | 41 +- osfmk/i386/machine_routines.c | 4 +- osfmk/i386/machine_routines_asm.s | 5 +- osfmk/i386/mp.c | 126 +- osfmk/i386/mp.h | 9 + osfmk/i386/mp_desc.c | 35 +- osfmk/i386/mp_events.h | 2 + osfmk/i386/pcb.c | 10 +- osfmk/i386/pmCPU.c | 87 +- osfmk/i386/pmCPU.h | 7 +- osfmk/i386/pmap.c | 16 +- osfmk/i386/proc_reg.h | 18 +- osfmk/i386/rtclock.c | 11 - osfmk/i386/rtclock.h | 2 + osfmk/i386/tsc.c | 37 +- osfmk/i386/tsc.h | 1 + osfmk/i386/vmx/vmx_cpu.c | 1 + osfmk/ipc/ipc_kmsg.h | 23 +- osfmk/ipc/ipc_mqueue.c | 12 +- osfmk/ipc/ipc_mqueue.h | 1 + osfmk/ipc/ipc_notify.c | 47 +- osfmk/ipc/ipc_right.c | 2 +- osfmk/ipc/mach_msg.c | 13 +- osfmk/kern/ast.c | 13 +- osfmk/kern/call_entry.h | 34 +- osfmk/kern/clock.h | 13 +- osfmk/kern/debug.c | 69 +- osfmk/kern/debug.h | 1 + osfmk/kern/hibernate.c | 55 +- osfmk/kern/host.c | 12 +- osfmk/kern/ipc_mig.c | 19 +- osfmk/kern/kmod.c | 419 ++++++- osfmk/kern/machine.c | 36 +- osfmk/kern/misc_protos.h | 4 + osfmk/kern/printf.c | 18 + osfmk/kern/priority.c | 3 +- osfmk/kern/processor.c | 64 +- osfmk/kern/processor.h | 42 +- osfmk/kern/processor_data.c | 4 +- osfmk/kern/processor_data.h | 7 +- osfmk/kern/sched.h | 33 +- osfmk/kern/sched_prim.c | 195 +-- osfmk/kern/stack.c | 3 +- osfmk/kern/startup.c | 4 +- osfmk/kern/syscall_subr.c | 4 +- osfmk/kern/thread_call.c | 908 ++++++-------- osfmk/kern/thread_call.h | 123 +- osfmk/kern/timer_call.c | 253 ++-- osfmk/kern/timer_call.h | 50 +- osfmk/kern/timer_queue.h | 70 ++ osfmk/kern/zalloc.c | 478 ++++++-- osfmk/mach/Makefile | 4 + osfmk/mach/kext_panic_report.h | 82 ++ osfmk/mach/kmod.h | 3 + osfmk/mach/machine.h | 2 + osfmk/mach/port.h | 1 + osfmk/ppc/Diagnostics.c | 4 +- osfmk/ppc/cpu.c | 11 +- osfmk/ppc/etimer.c | 42 +- osfmk/ppc/exception.h | 3 +- osfmk/ppc/machine_routines.c | 17 +- osfmk/ppc/model_dep.c | 3 +- osfmk/ppc/ppc_init.c | 2 + osfmk/ppc/rtclock.c | 17 +- osfmk/ppc/rtclock.h | 3 +- osfmk/vm/memory_object.c | 4 +- osfmk/vm/vm_apple_protect.c | 3 +- osfmk/vm/vm_fault.c | 41 +- osfmk/vm/vm_init.c | 3 + pexpert/i386/pe_misc.s | 1 + pexpert/pexpert/pexpert.h | 1 + security/mac_framework.h | 2 +- security/mac_policy.h | 7 +- security/mac_vfs.c | 9 +- 206 files changed, 6609 insertions(+), 2295 deletions(-) create mode 100644 osfmk/i386/commpage/bcopy_sse42.s create mode 100644 osfmk/i386/commpage/bcopy_sse42_64.s create mode 100644 osfmk/i386/commpage/bzero_sse42.s create mode 100644 osfmk/i386/commpage/bzero_sse42_64.s create mode 100644 osfmk/kern/timer_queue.h create mode 100644 osfmk/mach/kext_panic_report.h diff --git a/bsd/conf/MASTER b/bsd/conf/MASTER index 5419f96bb..9459048eb 100644 --- a/bsd/conf/MASTER +++ b/bsd/conf/MASTER @@ -165,6 +165,7 @@ options CONFIG_SOWUPCALL # SB_UPCALL on sowwakeup # options CONFIG_FORCE_OUT_IFP # 
Force IP output to use an interface # options CONFIG_MBUF_NOEXPAND # limit mbuf expansion # options CONFIG_MBUF_JUMBO # jumbo cluster pool # +options CONFIG_SCOPEDROUTING # scoped routing on by default # options CONFIG_IP_EDGEHOLE # Drop tagged packets at EDGE interface # options CONFIG_WORKQUEUE # diff --git a/bsd/conf/Makefile.i386 b/bsd/conf/Makefile.i386 index 07c022208..ec78b385c 100644 --- a/bsd/conf/Makefile.i386 +++ b/bsd/conf/Makefile.i386 @@ -50,7 +50,6 @@ OBJS_NO_WERROR = \ ip_fw2_compat.o \ kpi_ipfilter.o \ in_gif.o \ - in_pcb.o \ ip_divert.o \ ip_dummynet.o \ ip_icmp.o \ diff --git a/bsd/conf/Makefile.ppc b/bsd/conf/Makefile.ppc index ac870fd86..89d810966 100644 --- a/bsd/conf/Makefile.ppc +++ b/bsd/conf/Makefile.ppc @@ -51,7 +51,6 @@ OBJS_NO_WERROR = \ ip_fw2_compat.o \ kpi_ipfilter.o \ in_gif.o \ - in_pcb.o \ ip_divert.o \ ip_dummynet.o \ ip_icmp.o \ diff --git a/bsd/dev/dtrace/dtrace.c b/bsd/dev/dtrace/dtrace.c index eebffddbb..bdbe6a874 100644 --- a/bsd/dev/dtrace/dtrace.c +++ b/bsd/dev/dtrace/dtrace.c @@ -970,6 +970,8 @@ dtrace_priv_proc_common_zone(dtrace_state_t *state) return (0); #else +#pragma unused(state) + return 1; /* Darwin doesn't do zones. */ #endif /* __APPLE__ */ } @@ -1124,7 +1126,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate) dtrace_dstate_percpu_t *dcpu; int i, work = 0; - for (i = 0; i < NCPU; i++) { + for (i = 0; i < (int)NCPU; i++) { dcpu = &dstate->dtds_percpu[i]; ASSERT(dcpu->dtdsc_rinsing == NULL); @@ -1174,7 +1176,7 @@ dtrace_dynvar_clean(dtrace_dstate_t *dstate) dtrace_sync(); - for (i = 0; i < NCPU; i++) { + for (i = 0; i < (int)NCPU; i++) { dcpu = &dstate->dtds_percpu[i]; if (dcpu->dtdsc_rinsing == NULL) @@ -1519,7 +1521,7 @@ retry: case DTRACE_DSTATE_CLEAN: { void *sp = &dstate->dtds_state; - if (++cpu >= NCPU) + if (++cpu >= (int)NCPU) cpu = 0; if (dcpu->dtdsc_dirty != NULL && @@ -1667,6 +1669,7 @@ retry: static void dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg) { +#pragma unused(arg) if (nval < *oval) *oval = nval; } @@ -1675,6 +1678,7 @@ dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg) static void dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg) { +#pragma unused(arg) if (nval > *oval) *oval = nval; } @@ -1744,6 +1748,7 @@ dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr) static void dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg) { +#pragma unused(arg) data[0]++; data[1] += nval; } @@ -1752,6 +1757,7 @@ dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg) static void dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg) { +#pragma unused(nval,arg) *oval = *oval + 1; } @@ -1759,6 +1765,7 @@ dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg) static void dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg) { +#pragma unused(arg) *oval += nval; } @@ -1773,6 +1780,7 @@ static void dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf, intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg) { +#pragma unused(arg) dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec; uint32_t i, ndx, size, fsize; uint32_t align = sizeof (uint64_t) - 1; @@ -3532,7 +3540,7 @@ dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs, * string -- setting a bit in the map for every character * found in the token string. 
*/ - for (i = 0; i < sizeof (tokmap); i++) + for (i = 0; i < (int)sizeof (tokmap); i++) tokmap[i] = 0; for (; tokaddr < toklimit; tokaddr++) { @@ -4578,7 +4586,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, size_t sz = v->dtdv_type.dtdt_size; sz += sizeof (uint64_t); - ASSERT(svar->dtsv_size == NCPU * sz); + ASSERT(svar->dtsv_size == (int)NCPU * sz); a += CPU->cpu_id * sz; if (*(uint8_t *)a == UINT8_MAX) { @@ -4595,7 +4603,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, break; } - ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t)); + ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t)); tmp = (uint64_t *)(uintptr_t)svar->dtsv_data; regs[rd] = tmp[CPU->cpu_id]; break; @@ -4617,7 +4625,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, size_t sz = v->dtdv_type.dtdt_size; sz += sizeof (uint64_t); - ASSERT(svar->dtsv_size == NCPU * sz); + ASSERT(svar->dtsv_size == (int)NCPU * sz); a += CPU->cpu_id * sz; if (regs[rd] == NULL) { @@ -4633,7 +4641,7 @@ dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate, break; } - ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t)); + ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t)); tmp = (uint64_t *)(uintptr_t)svar->dtsv_data; tmp[CPU->cpu_id] = regs[rd]; break; @@ -5403,7 +5411,7 @@ __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1, #ifdef lint uint64_t val = 0; #else - uint64_t val; + uint64_t val = 0; #endif mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE; @@ -6535,6 +6543,7 @@ dtrace_match_string(const char *s, const char *p, int depth) static int dtrace_match_nul(const char *s, const char *p, int depth) { +#pragma unused(s,p,depth) return (1); /* always match the empty pattern */ } @@ -6542,6 +6551,7 @@ dtrace_match_nul(const char *s, const char *p, int depth) static int dtrace_match_nonzero(const char *s, const char *p, int depth) { +#pragma unused(p,depth) return (s != NULL && s[0] != '\0'); } @@ -7296,7 +7306,6 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) } do { - kmod_info_t *ktl; /* * First, call the blanket provide operation. */ @@ -7322,10 +7331,10 @@ dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv) lck_mtx_unlock(&mod_lock); #else -#if 0 /* XXX Workaround for PR_4643546 XXX */ +#if 0 /* FIXME: Workaround for PR_4643546 */ simple_lock(&kmod_lock); - ktl = kmod; + kmod_info_t *ktl = kmod; while (ktl) { prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ktl); ktl = ktl->next; @@ -8561,10 +8570,10 @@ dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate) svarp = &vstate->dtvs_locals; if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) - dsize = NCPU * (v->dtdv_type.dtdt_size + + dsize = (int)NCPU * (v->dtdv_type.dtdt_size + sizeof (uint64_t)); else - dsize = NCPU * sizeof (uint64_t); + dsize = (int)NCPU * sizeof (uint64_t); break; @@ -9100,7 +9109,7 @@ dtrace_ecb_resize(dtrace_ecb_t *ecb) */ diff = offs + sizeof (dtrace_aggid_t); - if (diff = (diff & (sizeof (uint64_t) - 1))) + if ((diff = (diff & (sizeof (uint64_t) - 1)))) offs += sizeof (uint64_t) - diff; aggbase = offs - sizeof (dtrace_aggid_t); @@ -9795,12 +9804,12 @@ dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe, * of creating our own (saving both time and space). 
*/ dtrace_ecb_t *cached = dtrace_ecb_create_cache; - dtrace_action_t *act = cached->dte_action; + dtrace_action_t *act_if = cached->dte_action; - if (act != NULL) { - ASSERT(act->dta_refcnt > 0); - act->dta_refcnt++; - ecb->dte_action = act; + if (act_if != NULL) { + ASSERT(act_if->dta_refcnt > 0); + act_if->dta_refcnt++; + ecb->dte_action = act_if; ecb->dte_action_last = cached->dte_action_last; ecb->dte_needed = cached->dte_needed; ecb->dte_size = cached->dte_size; @@ -9961,7 +9970,7 @@ dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags, return (EFBIG); #if defined(__APPLE__) - if (size > (sane_size / 8) / NCPU) /* As in kdbg_set_nkdbufs(), roughly. */ + if (size > (sane_size / 8) / (int)NCPU) /* As in kdbg_set_nkdbufs(), roughly. */ return (ENOMEM); #endif /* __APPLE__ */ @@ -10056,7 +10065,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, intptr_t offs = buf->dtb_offset, soffs; intptr_t woffs; caddr_t tomax; - size_t total; + size_t total_off; if (buf->dtb_flags & DTRACEBUF_INACTIVE) return (-1); @@ -10100,7 +10109,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, goto out; } - total = needed + (offs & (align - 1)); + total_off = needed + (offs & (align - 1)); /* * For a ring buffer, life is quite a bit more complicated. Before @@ -10109,15 +10118,15 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, * is required.) */ if ((buf->dtb_flags & DTRACEBUF_WRAPPED) || - offs + total > buf->dtb_size) { + offs + total_off > buf->dtb_size) { woffs = buf->dtb_xamot_offset; - if (offs + total > buf->dtb_size) { + if (offs + total_off > buf->dtb_size) { /* * We can't fit in the end of the buffer. First, a * sanity check that we can fit in the buffer at all. */ - if (total > buf->dtb_size) { + if (total_off > buf->dtb_size) { dtrace_buffer_drop(buf); return (-1); } @@ -10160,7 +10169,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, * that the top of the buffer is aligned. 
*/ offs = 0; - total = needed; + total_off = needed; buf->dtb_flags |= DTRACEBUF_WRAPPED; } else { /* @@ -10186,7 +10195,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, } } - while (offs + total > woffs) { + while (offs + total_off > woffs) { dtrace_epid_t epid = *(uint32_t *)(tomax + woffs); size_t size; @@ -10226,7 +10235,7 @@ dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align, if (offs == 0) { buf->dtb_flags &= ~DTRACEBUF_WRAPPED; buf->dtb_offset = 0; - woffs = total; + woffs = total_off; while (woffs < buf->dtb_size) tomax[woffs++] = 0; @@ -10333,7 +10342,7 @@ dtrace_buffer_free(dtrace_buffer_t *bufs) { int i; - for (i = 0; i < NCPU; i++) { + for (i = 0; i < (int)NCPU; i++) { dtrace_buffer_t *buf = &bufs[i]; if (buf->dtb_tomax == NULL) { @@ -10714,7 +10723,7 @@ static int dtrace_enabling_matchstate(dtrace_state_t *state, int *nmatched) { dtrace_enabling_t *enab; - int matched, total = 0, err; + int matched, total_matched = 0, err; lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -10728,11 +10737,11 @@ dtrace_enabling_matchstate(dtrace_state_t *state, int *nmatched) if ((err = dtrace_enabling_match(enab, &matched)) != 0) return (err); - total += matched; + total_matched += matched; } if (nmatched != NULL) - *nmatched = total; + *nmatched = total_matched; return (0); } @@ -10824,6 +10833,7 @@ dtrace_enabling_provide(dtrace_provider_t *prv) static void dtrace_dof_error(dof_hdr_t *dof, const char *str) { +#pragma unused(dof) if (dtrace_err_verbose) cmn_err(CE_WARN, "failed to process DOF: %s", str); @@ -11155,7 +11165,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate, size_t ttl = 0; dof_difohdr_t *dofd; uintptr_t daddr = (uintptr_t)dof; - size_t max = dtrace_difo_maxsize; + size_t max_size = dtrace_difo_maxsize; int i, l, n; static const struct { @@ -11220,7 +11230,7 @@ dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate, dofd->dofd_links[l])) == NULL) goto err; /* invalid section link */ - if (ttl + subsec->dofs_size > max) { + if (ttl + subsec->dofs_size > max_size) { dtrace_dof_error(dof, "exceeds maximum size"); goto err; } @@ -11887,7 +11897,7 @@ static int dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) { - size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize; + size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize; void *base; uintptr_t limit; dtrace_dynvar_t *dvar, *next, *start; @@ -11901,8 +11911,8 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) if ((dstate->dtds_chunksize = chunksize) == 0) dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE; - if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t))) - size = min; + if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t))) + size = min_size; if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL) return (ENOMEM); @@ -11910,7 +11920,7 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) dstate->dtds_size = size; dstate->dtds_base = base; dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP); - bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t)); + bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t)); hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)); @@ -11941,10 +11951,10 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t)); limit = (uintptr_t)base + size; - maxper = 
(limit - (uintptr_t)start) / NCPU; + maxper = (limit - (uintptr_t)start) / (int)NCPU; maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize; - for (i = 0; i < NCPU; i++) { + for (i = 0; i < (int)NCPU; i++) { dstate->dtds_percpu[i].dtdsc_free = dvar = start; /* @@ -11954,7 +11964,7 @@ dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size) * whatever is left over. In either case, we set the limit to * be the limit of the dynamic variable space. */ - if (maxper == 0 || i == NCPU - 1) { + if (maxper == 0 || i == (int)NCPU - 1) { limit = (uintptr_t)base + size; start = NULL; } else { @@ -12071,7 +12081,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr) char c[30]; dtrace_state_t *state; dtrace_optval_t *opt; - int bufsize = NCPU * sizeof (dtrace_buffer_t), i; + int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i; lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); @@ -12310,7 +12320,7 @@ static int dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which) { dtrace_optval_t *opt = state->dts_options, size; - processorid_t cpu; + processorid_t cpu = 0; int flags = 0, rval; lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED); @@ -12430,7 +12440,7 @@ dtrace_state_go(dtrace_state_t *state, processorid_t *cpu) dtrace_buffer_t *buf; cyc_handler_t hdlr; cyc_time_t when; - int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t); + int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t); dtrace_icookie_t cookie; lck_mtx_lock(&cpu_lock); @@ -12808,7 +12818,7 @@ dtrace_state_destroy(dtrace_state_t *state) dtrace_ecb_t *ecb; dtrace_vstate_t *vstate = &state->dts_vstate; minor_t minor = getminor(state->dts_dev); - int i, bufsize = NCPU * sizeof (dtrace_buffer_t); + int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t); dtrace_speculation_t *spec = state->dts_speculations; int nspec = state->dts_nspeculations; uint32_t match; @@ -13100,7 +13110,7 @@ dtrace_helper_trace(dtrace_helper_action_t *helper, if ((svar = vstate->dtvs_locals[i]) == NULL) continue; - ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t)); + ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t)); ent->dtht_locals[i] = ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id]; } @@ -13113,7 +13123,7 @@ dtrace_helper(int which, dtrace_mstate_t *mstate, uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; uint64_t sarg0 = mstate->dtms_arg[0]; uint64_t sarg1 = mstate->dtms_arg[1]; - uint64_t rval; + uint64_t rval = 0; dtrace_helpers_t *helpers = curproc->p_dtrace_helpers; dtrace_helper_action_t *helper; dtrace_vstate_t *vstate; @@ -13262,7 +13272,7 @@ dtrace_helper_destroygen(proc_t* p, int gen) * given generation number. */ for (;;) { - dtrace_helper_provider_t *prov; + dtrace_helper_provider_t *prov = NULL; /* * Look for a helper provider with the right generation. 
We @@ -14840,7 +14850,7 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) 1, INT_MAX, 0); dtrace_state_cache = kmem_cache_create("dtrace_state_cache", - sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN, + sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED); @@ -15075,6 +15085,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) static int dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) { +#pragma unused(flag,otyp,cred_p) minor_t minor = getminor(dev); dtrace_state_t *state; @@ -15294,6 +15305,8 @@ dtrace_ioctl_helper(int cmd, caddr_t arg, int *rv) static int dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) { +#pragma unused(md) + minor_t minor = getminor(dev); dtrace_state_t *state; int rval; @@ -15798,7 +15811,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) if (copyin((void *)arg, &desc, sizeof (desc)) != 0) return (EFAULT); - if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU) + if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= (int)NCPU) return (EINVAL); lck_mtx_lock(&dtrace_lock); @@ -15964,7 +15977,7 @@ dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) nerrs = state->dts_errors; dstate = &state->dts_vstate.dtvs_dynvars; - for (i = 0; i < NCPU; i++) { + for (i = 0; i < (int)NCPU; i++) { dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i]; stat.dtst_dyndrops += dcpu->dtdsc_drops; diff --git a/bsd/dev/dtrace/dtrace_ptss.c b/bsd/dev/dtrace/dtrace_ptss.c index 8e2ec272e..f4503c9ef 100644 --- a/bsd/dev/dtrace/dtrace_ptss.c +++ b/bsd/dev/dtrace/dtrace_ptss.c @@ -161,6 +161,16 @@ dtrace_ptss_allocate_page(struct proc* p) mach_vm_address_t addr = 0LL; mach_vm_size_t size = PAGE_SIZE; // We need some way to assert that this matches vm_map_round_page() !!! +#if CONFIG_EMBEDDED + /* The embedded OS has extra permissions for writable and executable pages. We can't pass in the flags + * we need for the correct permissions from mach_vm_allocate, so need to call mach_vm_map directly. */ + vm_map_offset_t map_addr = 0; + kern_return_t kr = mach_vm_map(map, &map_addr, size, 0, VM_FLAGS_ANYWHERE, IPC_PORT_NULL, 0, FALSE, VM_PROT_READ|VM_PROT_EXECUTE, VM_PROT_READ|VM_PROT_EXECUTE, VM_INHERIT_DEFAULT); + if (kr != KERN_SUCCESS) { + goto err; + } + addr = map_addr; +#else kern_return_t kr = mach_vm_allocate(map, &addr, size, VM_FLAGS_ANYWHERE); if (kr != KERN_SUCCESS) { goto err; @@ -171,6 +181,7 @@ dtrace_ptss_allocate_page(struct proc* p) mach_vm_deallocate(map, addr, size); goto err; } +#endif // Chain the page entries. int i; diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c index b0828e6dc..3cb1b62e6 100644 --- a/bsd/dev/dtrace/fasttrap.c +++ b/bsd/dev/dtrace/fasttrap.c @@ -1771,6 +1771,7 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata) tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc; tp->ftt_pid = pdata->ftps_pid; + pp->ftp_tps[0].fit_tp = tp; pp->ftp_tps[0].fit_id.fti_probe = pp; #if defined(__APPLE__) @@ -2368,6 +2369,8 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) * Yes, this is a WAG. 
*/ fasttrap_max = (sane_size >> 28) * 100000; + if (fasttrap_max == 0) + fasttrap_max = 50000; #endif fasttrap_total = 0; diff --git a/bsd/dev/dtrace/lockstat.c b/bsd/dev/dtrace/lockstat.c index 3c5602be9..82539d98b 100644 --- a/bsd/dev/dtrace/lockstat.c +++ b/bsd/dev/dtrace/lockstat.c @@ -183,6 +183,7 @@ vm_offset_t *assembly_probes[] = { */ void lockstat_hot_patch(boolean_t active) { +#pragma unused(active) int i; @@ -224,6 +225,7 @@ static dtrace_provider_id_t lockstat_id; static void lockstat_enable(void *arg, dtrace_id_t id, void *parg) { +#pragma unused(arg) lockstat_probe_t *probe = parg; ASSERT(!lockstat_probemap[probe->lsp_probe]); @@ -243,6 +245,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg) static void lockstat_disable(void *arg, dtrace_id_t id, void *parg) { +#pragma unused(arg,id) lockstat_probe_t *probe = parg; int i; @@ -272,6 +275,7 @@ lockstat_disable(void *arg, dtrace_id_t id, void *parg) static void lockstat_provide(void *arg, const dtrace_probedesc_t *desc) { +#pragma unused(arg,desc) int i = 0; for (i = 0; lockstat_probes[i].lsp_func != NULL; i++) { @@ -293,6 +297,7 @@ lockstat_provide(void *arg, const dtrace_probedesc_t *desc) static void lockstat_destroy(void *arg, dtrace_id_t id, void *parg) { +#pragma unused(arg,id) lockstat_probe_t *probe = parg; ASSERT(!lockstat_probemap[probe->lsp_probe]); diff --git a/bsd/dev/dtrace/profile_prvd.c b/bsd/dev/dtrace/profile_prvd.c index 14895f8d9..cd561c2df 100644 --- a/bsd/dev/dtrace/profile_prvd.c +++ b/bsd/dev/dtrace/profile_prvd.c @@ -206,6 +206,7 @@ profile_fire(void *arg) CPU->cpu_profile_upc, late, 0, 0); #else #if defined(__ppc__) || defined(__ppc64__) + { struct savearea *sv = find_kern_regs(current_thread()); if (sv) { @@ -218,7 +219,9 @@ profile_fire(void *arg) dtrace_probe(prof->prof_id, 0xcafebabe, 0x0, late, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */ } + } #elif defined(__i386__) || defined(__x86_64__) + { x86_saved_state32_t *kern_regs = find_kern_regs(current_thread()); if (NULL != kern_regs) { @@ -242,6 +245,7 @@ profile_fire(void *arg) dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0); } } + } #else #error Unknown architecture #endif @@ -258,6 +262,7 @@ profile_tick(void *arg) CPU->cpu_profile_upc, 0, 0, 0); #else #if defined(__ppc__) || defined(__ppc64__) + { struct savearea *sv = find_kern_regs(current_thread()); if (sv) { @@ -270,7 +275,9 @@ profile_tick(void *arg) dtrace_probe(prof->prof_id, 0xcafebabe, 0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */ } + } #elif defined(__i386__) || defined(__x86_64__) + { x86_saved_state32_t *kern_regs = find_kern_regs(current_thread()); if (NULL != kern_regs) { @@ -294,6 +301,7 @@ profile_tick(void *arg) dtrace_probe(prof->prof_id, 0x0, regs->eip, 0, 0, 0); } } + } #else #error Unknown architecture #endif diff --git a/bsd/dev/dtrace/sdt.c b/bsd/dev/dtrace/sdt.c index 640bfae34..946c6a4c6 100644 --- a/bsd/dev/dtrace/sdt.c +++ b/bsd/dev/dtrace/sdt.c @@ -657,6 +657,7 @@ void sdt_provide_module(void *arg, struct modctl *ctl) { #pragma unused(ctl) +#pragma unused(arg) __sdt_provide_module(arg, &g_sdt_kernctl); sdt_probedesc_t *sdpd = g_sdt_mach_module.sdt_probes; diff --git a/bsd/dev/dtrace/systrace.c b/bsd/dev/dtrace/systrace.c index 35601e943..52362b640 100644 --- a/bsd/dev/dtrace/systrace.c +++ b/bsd/dev/dtrace/systrace.c @@ -161,8 +161,12 @@ dtrace_systrace_syscall(struct proc *pp, void *uap, int *rv) // Bounds "check" the value of code a la unix_syscall sy = (code >= NUM_SYSENT) ? 
&systrace_sysent[63] : &systrace_sysent[code]; - if ((id = sy->stsy_entry) != DTRACE_IDNONE) - (*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4)); + if ((id = sy->stsy_entry) != DTRACE_IDNONE) { + if (ip) + (*systrace_probe)(id, *ip, *(ip+1), *(ip+2), *(ip+3), *(ip+4)); + else + (*systrace_probe)(id, 0, 0, 0, 0, 0); + } #if 0 /* XXX */ /* diff --git a/bsd/dev/i386/sysctl.c b/bsd/dev/i386/sysctl.c index 2637c2654..3cb481c40 100644 --- a/bsd/dev/i386/sysctl.c +++ b/bsd/dev/i386/sysctl.c @@ -101,6 +101,59 @@ hw_cpu_logical_per_package SYSCTL_HANDLER_ARGS sizeof(cpu_info->cpuid_logical_per_package)); } +static int +hw_cpu_sysctl_nehalem SYSCTL_HANDLER_ARGS +{ + i386_cpu_info_t *cpu_info = cpuid_info(); + + if (cpu_info->cpuid_model != 26) + return ENOENT; + + hw_cpu_sysctl(oidp, arg1, arg2, req); +} + +static int +hw_cpu_flex_ratio_desired SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; + i386_cpu_info_t *cpu_info = cpuid_info(); + + if (cpu_info->cpuid_model != 26) + return ENOENT; + + return SYSCTL_OUT(req, &flex_ratio, sizeof(flex_ratio)); +} + +static int +hw_cpu_flex_ratio_min SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; + i386_cpu_info_t *cpu_info = cpuid_info(); + + if (cpu_info->cpuid_model != 26) + return ENOENT; + + return SYSCTL_OUT(req, &flex_ratio_min, sizeof(flex_ratio_min)); +} + +static int +hw_cpu_flex_ratio_max SYSCTL_HANDLER_ARGS +{ + __unused struct sysctl_oid *unused_oidp = oidp; + __unused void *unused_arg1 = arg1; + __unused int unused_arg2 = arg2; + i386_cpu_info_t *cpu_info = cpuid_info(); + + if (cpu_info->cpuid_model != 26) + return ENOENT; + + return SYSCTL_OUT(req, &flex_ratio_max, sizeof(flex_ratio_max)); +} + SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "CPU info"); @@ -353,6 +406,23 @@ SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count, sizeof(uint32_t), hw_cpu_sysctl, "I", "Number of enabled threads per package"); +SYSCTL_NODE(_machdep_cpu, OID_AUTO, flex_ratio, CTLFLAG_RW|CTLFLAG_LOCKED, 0, + "Flex ratio"); + +SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, desired, + CTLTYPE_INT | CTLFLAG_RD, + 0, 0, + hw_cpu_flex_ratio_desired, "I", "Flex ratio desired (0 disabled)"); + +SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, min, + CTLTYPE_INT | CTLFLAG_RD, + 0, 0, + hw_cpu_flex_ratio_min, "I", "Flex ratio min (efficiency)"); + +SYSCTL_PROC(_machdep_cpu_flex_ratio, OID_AUTO, max, + CTLTYPE_INT | CTLFLAG_RD, + 0, 0, + hw_cpu_flex_ratio_max, "I", "Flex ratio max (non-turbo)"); uint64_t pmap_pv_hashlist_walks; uint64_t pmap_pv_hashlist_cnts; diff --git a/bsd/dev/unix_startup.c b/bsd/dev/unix_startup.c index d2dd20b11..1522646ea 100644 --- a/bsd/dev/unix_startup.c +++ b/bsd/dev/unix_startup.c @@ -46,6 +46,7 @@ #include #include #include +#include extern vm_map_t mb_map; @@ -81,6 +82,7 @@ SYSCTL_INT (_kern, OID_AUTO, maxnbuf, CTLFLAG_RW, &max_nbuf_headers, 0, ""); __private_extern__ int customnbuf = 0; int srv = 0; /* Flag indicates a server boot when set */ int ncl = 0; +static unsigned int mbuf_poolsz; vm_map_t buffer_map; vm_map_t bufferhdr_map; @@ -209,6 +211,9 @@ bsd_bufferinit(void) bufinit(); } +/* 512 MB hard limit on size of the mbuf pool */ +#define MAX_MBUF_POOL (512 << MBSHIFT) +#define MAX_NCL (MAX_MBUF_POOL >> MCLSHIFT) /* * this has been broken out into a separate routine that @@ -220,8 +225,13 @@ bsd_bufferinit(void) int 
bsd_mbuf_cluster_reserve(void) { - if (sane_size > (64 * 1024 * 1024) || ncl) { + /* If called more than once, return the previously calculated size */ + if (mbuf_poolsz != 0) + goto done; + + PE_parse_boot_argn("ncl", &ncl, sizeof (ncl)); + if (sane_size > (64 * 1024 * 1024) || ncl) { if ((nmbclusters = ncl) == 0) { if ((nmbclusters = ((sane_size / 16)/MCLBYTES)) > 32768) nmbclusters = 32768; @@ -229,7 +239,13 @@ bsd_mbuf_cluster_reserve(void) /* Make sure it's not odd in case ncl is manually set */ if (nmbclusters & 0x1) --nmbclusters; - } + /* And obey the upper limit */ + if (nmbclusters > MAX_NCL) + nmbclusters = MAX_NCL; + + } + mbuf_poolsz = nmbclusters << MCLSHIFT; +done: return (nmbclusters * MCLBYTES); } diff --git a/bsd/hfs/hfs.h b/bsd/hfs/hfs.h index cfed9e65d..f3c12bb41 100644 --- a/bsd/hfs/hfs.h +++ b/bsd/hfs/hfs.h @@ -266,6 +266,7 @@ typedef struct hfsmount { lck_mtx_t hfs_mutex; /* protects access to hfsmount data */ void *hfs_freezing_proc; /* who froze the fs */ + void *hfs_downgrading_proc; /* process who's downgrading to rdonly */ lck_rw_t hfs_insync; /* protects sync/freeze interaction */ /* Resize variables: */ @@ -341,6 +342,9 @@ enum privdirtype {FILE_HARDLINKS, DIR_HARDLINKS}; #define HFS_VIRTUAL_DEVICE 0x20000 /* When set, we're in hfs_changefs, so hfs_sync should do nothing. */ #define HFS_IN_CHANGEFS 0x40000 +/* When set, we are in process of downgrading or have downgraded to read-only, + * so hfs_start_transaction should return EROFS. */ +#define HFS_RDONLY_DOWNGRADE 0x80000 /* Macro to update next allocation block in the HFS mount structure. If diff --git a/bsd/hfs/hfs_encodings.c b/bsd/hfs/hfs_encodings.c index d0e89e8d8..c531aa28b 100644 --- a/bsd/hfs/hfs_encodings.c +++ b/bsd/hfs/hfs_encodings.c @@ -211,6 +211,7 @@ hfs_relconverter(u_int32_t encoding) lck_mtx_unlock(&encodinglst_mutex); FREE(encp, M_TEMP); + record_kext_unload(id); kmod_destroy((host_priv_t) host_priv_self(), id); return (0); } diff --git a/bsd/hfs/hfs_link.c b/bsd/hfs/hfs_link.c index f6c5e8409..ba47918e3 100644 --- a/bsd/hfs/hfs_link.c +++ b/bsd/hfs/hfs_link.c @@ -438,6 +438,25 @@ hfs_vnop_link(struct vnop_link_args *ap) } tdcp = VTOC(tdvp); cp = VTOC(vp); + + /* + * Make sure we don't race the src or dst parent directories with rmdir. + * Note that we should only have a src parent directory cnode lock + * if we're dealing with a directory hardlink here. + */ + if (fdcp) { + if (fdcp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + } + + if (tdcp->c_flag & (C_NOEXISTS | C_DELETED)) { + error = ENOENT; + goto out; + } + + /* Check src for errors: too many links, immutable, race with unlink */ if (cp->c_linkcount >= HFS_LINK_MAX) { error = EMLINK; goto out; diff --git a/bsd/hfs/hfs_vfsops.c b/bsd/hfs/hfs_vfsops.c index 8eac4e20e..b2e71a034 100644 --- a/bsd/hfs/hfs_vfsops.c +++ b/bsd/hfs/hfs_vfsops.c @@ -220,17 +220,32 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte vfs_isrdonly(mp)) { int flags; + /* Set flag to indicate that a downgrade to read-only + * is in progress and therefore block any further + * modifications to the file system. 
+ */ + hfs_global_exclusive_lock_acquire(hfsmp); + hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_proc = current_thread(); + hfs_global_exclusive_lock_release(hfsmp); + /* use VFS_SYNC to push out System (btree) files */ retval = VFS_SYNC(mp, MNT_WAIT, context); - if (retval && ((cmdflags & MNT_FORCE) == 0)) + if (retval && ((cmdflags & MNT_FORCE) == 0)) { + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_proc = NULL; goto out; + } flags = WRITECLOSE; if (cmdflags & MNT_FORCE) flags |= FORCECLOSE; - if ((retval = hfs_flushfiles(mp, flags, p))) + if ((retval = hfs_flushfiles(mp, flags, p))) { + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_proc = NULL; goto out; + } /* mark the volume cleanly unmounted */ hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask; @@ -248,6 +263,8 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte } } if (retval) { + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_proc = NULL; hfsmp->hfs_flags &= ~HFS_READ_ONLY; goto out; } @@ -263,6 +280,8 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte hfs_global_exclusive_lock_release(hfsmp); } + + hfsmp->hfs_downgrading_proc = NULL; } /* Change to a writable file system. */ @@ -317,6 +336,13 @@ hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t conte /* Only clear HFS_READ_ONLY after a successfull write */ hfsmp->hfs_flags &= ~HFS_READ_ONLY; + /* If this mount point was downgraded from read-write + * to read-only, clear that information as we are now + * moving back to read-write. + */ + hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE; + hfsmp->hfs_downgrading_proc = NULL; + /* mark the volume dirty (clear clean unmount bit) */ hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask; @@ -885,8 +911,13 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, * block size to be 4k if there are more than 31-bits * worth of blocks but to insure compatibility with * pre-Tiger systems we have to do it. + * + * If the device size is not a multiple of 4K (8 * 512), then + * switching the logical block size isn't going to help because + * we will be unable to write the alternate volume header. + * In this case, just leave the logical block size unchanged. */ - if (log_blkcnt > 0x000000007fffffff) { + if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) { minblksize = log_blksize = 4096; if (phys_blksize < log_blksize) phys_blksize = log_blksize; @@ -1024,6 +1055,8 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, } hfsmp->hfs_logical_block_size = log_blksize; hfsmp->hfs_logical_block_count = log_blkcnt; + hfsmp->hfs_physical_block_size = log_blksize; + hfsmp->hfs_log_per_phys = 1; } if (args) { hfsmp->hfs_encoding = args->hfs_encoding; @@ -1078,6 +1111,11 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize; hfsmp->hfs_logical_block_size = log_blksize; + + /* Update logical/physical block size */ + hfsmp->hfs_physical_block_size = log_blksize; + phys_blksize = log_blksize; + hfsmp->hfs_log_per_phys = 1; } disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) * @@ -1218,6 +1256,7 @@ hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, /* Note: relative block count adjustment (in case this is an embedded volume). 
*/ hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize; hfsmp->hfs_logical_block_size = log_blksize; + hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize; if (hfsmp->jnl) { // close and re-open this with the new block size @@ -3155,9 +3194,6 @@ hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context) /* If ioctl is not supported, force physical and logical sector size to be same */ phys_sectorsize = sectorsize; } - if (phys_sectorsize != hfsmp->hfs_physical_block_size) { - return (ENXIO); - } oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize; /* @@ -4493,19 +4529,29 @@ end_iteration: /* Now move any files that are in the way. */ for (i = 0; i < filecnt; ++i) { struct vnode * rvp; + struct cnode * cp; if (hfs_vget(hfsmp, cnidbufp[i], &vp, 0) != 0) continue; + /* Relocating directory hard links is not supported, so we + * punt (see radar 6217026). */ + cp = VTOC(vp); + if ((cp->c_flag & C_HARDLINK) && vnode_isdir(vp)) { + printf("hfs_reclaimspace: unable to relocate directory hard link %d\n", cp->c_cnid); + error = EINVAL; + goto out; + } + /* Relocate any data fork blocks. */ - if (VTOF(vp)->ff_blocks > 0) { + if (VTOF(vp) && VTOF(vp)->ff_blocks > 0) { error = hfs_relocate(vp, hfsmp->hfs_metazone_end + 1, kauth_cred_get(), current_proc()); } if (error) break; /* Relocate any resource fork blocks. */ - if ((VTOC((vp))->c_blocks - VTOF((vp))->ff_blocks) > 0) { + if ((cp->c_blocks - (VTOF(vp) ? VTOF((vp))->ff_blocks : 0)) > 0) { error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE); if (error) break; @@ -4514,7 +4560,7 @@ end_iteration: if (error) break; } - hfs_unlock(VTOC(vp)); + hfs_unlock(cp); vnode_put(vp); vp = NULL; diff --git a/bsd/hfs/hfs_vfsutils.c b/bsd/hfs/hfs_vfsutils.c index 43e5ae8be..ce577ec74 100644 --- a/bsd/hfs/hfs_vfsutils.c +++ b/bsd/hfs/hfs_vfsutils.c @@ -346,6 +346,7 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, */ if (blockSize < hfsmp->hfs_physical_block_size) { hfsmp->hfs_physical_block_size = hfsmp->hfs_logical_block_size; + hfsmp->hfs_log_per_phys = 1; } /* @@ -438,14 +439,18 @@ OSErr hfs_MountHFSPlusVolume(struct hfsmount *hfsmp, HFSPlusVolumeHeader *vhp, retval = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr, &cfork, &hfsmp->hfs_extents_vp); if (retval) + { goto ErrorExit; + } hfsmp->hfs_extents_cp = VTOC(hfsmp->hfs_extents_vp); hfs_unlock(hfsmp->hfs_extents_cp); retval = MacToVFSError(BTOpenPath(VTOF(hfsmp->hfs_extents_vp), (KeyCompareProcPtr) CompareExtentKeysPlus)); if (retval) + { goto ErrorExit; + } /* * Set up Catalog B-tree vnode */ @@ -2372,6 +2377,16 @@ hfs_start_transaction(struct hfsmount *hfsmp) unlock_on_err = 1; } + /* If a downgrade to read-only mount is in progress, no other + * process than the downgrade process is allowed to modify + * the file system. 
+ */ + if ((hfsmp->hfs_flags & HFS_RDONLY_DOWNGRADE) && + (hfsmp->hfs_downgrading_proc != thread)) { + ret = EROFS; + goto out; + } + if (hfsmp->jnl) { ret = journal_start_transaction(hfsmp->jnl); if (ret == 0) { @@ -2381,6 +2396,7 @@ hfs_start_transaction(struct hfsmount *hfsmp) ret = 0; } +out: if (ret != 0 && unlock_on_err) { lck_rw_unlock_shared(&hfsmp->hfs_global_lock); } diff --git a/bsd/kern/kdebug.c b/bsd/kern/kdebug.c index 737743535..ee4b63f40 100644 --- a/bsd/kern/kdebug.c +++ b/bsd/kern/kdebug.c @@ -1043,17 +1043,27 @@ kdbg_control_chud(int val, void *fn) int -kdbg_control(int *name, __unused u_int namelen, user_addr_t where, size_t *sizep) +kdbg_control(int *name, u_int namelen, user_addr_t where, size_t *sizep) { int ret=0; size_t size=*sizep; - unsigned int value = name[1]; + unsigned int value = 0; kd_regtype kd_Reg; kbufinfo_t kd_bufinfo; pid_t curpid; struct proc *p, *curproc; - + if (name[0] == KERN_KDGETENTROPY || + name[0] == KERN_KDEFLAGS || + name[0] == KERN_KDDFLAGS || + name[0] == KERN_KDENABLE || + name[0] == KERN_KDSETBUF) { + + if ( namelen < 2 ) + return(EINVAL); + value = name[1]; + } + kdbg_lock_init(); if ( !(kdebug_flags & KDBG_LOCKINIT)) diff --git a/bsd/kern/kern_credential.c b/bsd/kern/kern_credential.c index 89d2e8dbf..c7b4ca7a8 100644 --- a/bsd/kern/kern_credential.c +++ b/bsd/kern/kern_credential.c @@ -3239,9 +3239,15 @@ kauth_cred_label_update(kauth_cred_t cred, struct label *label) * vp The exec vnode * scriptl The script MAC label * execl The executable MAC label + * disjointp Pointer to flag to set if old + * and returned credentials are + * disjoint * * Returns: (kauth_cred_t) The updated credential * + * Implicit returns: + * *disjointp Set to 1 for disjoint creds + * * IMPORTANT: This function is implemented via kauth_cred_update(), which, * if it returns a credential other than the one it is passed, * will have dropped the reference on the passed credential. All @@ -3257,7 +3263,8 @@ kauth_cred_label_update(kauth_cred_t cred, struct label *label) static kauth_cred_t kauth_cred_label_update_execve(kauth_cred_t cred, vfs_context_t ctx, - struct vnode *vp, struct label *scriptl, struct label *execl) + struct vnode *vp, struct label *scriptl, struct label *execl, + int *disjointp) { kauth_cred_t newcred; struct ucred temp_cred; @@ -3266,8 +3273,8 @@ kauth_cred_label_update_execve(kauth_cred_t cred, vfs_context_t ctx, mac_cred_label_init(&temp_cred); mac_cred_label_associate(cred, &temp_cred); - mac_cred_label_update_execve(ctx, &temp_cred, - vp, scriptl, execl); + *disjointp = mac_cred_label_update_execve(ctx, &temp_cred, + vp, scriptl, execl); newcred = kauth_cred_update(cred, &temp_cred, TRUE); mac_cred_label_destroy(&temp_cred); @@ -3349,14 +3356,21 @@ int kauth_proc_label_update(struct proc *p, struct label *label) * scriptl The script MAC label * execl The executable MAC label * + * Returns: 0 Label update did not make credential + * disjoint + * 1 Label update caused credential to be + * disjoint + * * Notes: The credential associated with the process WILL change as a * result of this call. The caller should not assume the process * reference to the old credential still exists. 
*/ -int kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, +int +kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, struct vnode *vp, struct label *scriptl, struct label *execl) { kauth_cred_t my_cred, my_new_cred; + int disjoint = 0; my_cred = kauth_cred_proc_ref(p); @@ -3372,7 +3386,7 @@ int kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, * passed in. The subsequent compare is safe, because it is * a pointer compare rather than a contents compare. */ - my_new_cred = kauth_cred_label_update_execve(my_cred, ctx, vp, scriptl, execl); + my_new_cred = kauth_cred_label_update_execve(my_cred, ctx, vp, scriptl, execl, &disjoint); if (my_cred != my_new_cred) { DEBUG_CRED_CHANGE("kauth_proc_label_update_execve_unlocked CH(%d): %p/0x%08x -> %p/0x%08x\n", p->p_pid, my_cred, my_cred->cr_flags, my_new_cred, my_new_cred->cr_flags); @@ -3400,7 +3414,7 @@ int kauth_proc_label_update_execve(struct proc *p, vfs_context_t ctx, /* Drop old proc reference or our extra reference */ kauth_cred_unref(&my_cred); - return (0); + return (disjoint); } #if 1 diff --git a/bsd/kern/kern_exec.c b/bsd/kern/kern_exec.c index 43ab48894..71dd14c12 100644 --- a/bsd/kern/kern_exec.c +++ b/bsd/kern/kern_exec.c @@ -2543,24 +2543,33 @@ exec_handle_sugid(struct image_params *imgp) kauth_cred_t cred = vfs_context_ucred(imgp->ip_vfs_context); proc_t p = vfs_context_proc(imgp->ip_vfs_context); int i; - int is_member = 0; + int leave_sugid_clear = 0; int error = 0; struct vnode *dev_null = NULLVP; -#if CONFIG_MACF - kauth_cred_t my_cred; -#endif - #if CONFIG_MACF int mac_transition; - mac_transition = mac_cred_check_label_update_execve(imgp->ip_vfs_context, imgp->ip_vp, - imgp->ip_scriptlabelp, imgp->ip_execlabelp, p); + + /* + * Determine whether a call to update the MAC label will result in the + * credential changing. + * + * Note: MAC policies which do not actually end up modifying + * the label subsequently are strongly encouraged to + * return 0 for this check, since a non-zero answer will + * slow down the exec fast path for normal binaries. + */ + mac_transition = mac_cred_check_label_update_execve( + imgp->ip_vfs_context, + imgp->ip_vp, + imgp->ip_scriptlabelp, + imgp->ip_execlabelp, p); #endif OSBitAndAtomic(~((uint32_t)P_SUGID), (UInt32 *)&p->p_flag); /* * Order of the following is important; group checks must go last, - * as we use the success of the 'is_member' check combined with the + * as we use the success of the 'ismember' check combined with the * failure of the explicit match to indicate that we will be setting * the egid of the process even though the new process did not * require VSUID/VSGID bits in order for it to set the new group as @@ -2574,13 +2583,15 @@ exec_handle_sugid(struct image_params *imgp) */ if (((imgp->ip_origvattr->va_mode & VSUID) != 0 && kauth_cred_getuid(cred) != imgp->ip_origvattr->va_uid) || -#if CONFIG_MACF - mac_transition || /* A policy wants to transition */ -#endif ((imgp->ip_origvattr->va_mode & VSGID) != 0 && - ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &is_member) || !is_member) || + ((kauth_cred_ismember_gid(cred, imgp->ip_origvattr->va_gid, &leave_sugid_clear) || !leave_sugid_clear) || (cred->cr_gid != imgp->ip_origvattr->va_gid)))) { +#if CONFIG_MACF +/* label for MAC transition and neither VSUID nor VSGID */ +handle_mac_transition: +#endif + /* * Replace the credential with a copy of itself if euid or * egid change. 
@@ -2606,28 +2617,36 @@ exec_handle_sugid(struct image_params *imgp) #if CONFIG_MACF /* - * XXXMAC: In FreeBSD, we set P_SUGID on a MAC transition - * to protect against debuggers being attached by an - * insufficiently privileged process onto the result of - * a transition to a more privileged credential. This is - * too conservative on FreeBSD, but we need to do - * something similar here, or risk vulnerability. - * - * Before we make the call into the MAC policies, get a new + * If a policy has indicated that it will transition the label, + * before making the call into the MAC policies, get a new * duplicate credential, so they can modify it without * modifying any others sharing it. */ - if (mac_transition && !imgp->ip_no_trans) { - kauth_proc_label_update_execve(p, - imgp->ip_vfs_context, - imgp->ip_vp, - imgp->ip_scriptlabelp, imgp->ip_execlabelp); + if (mac_transition) { + kauth_cred_t my_cred; + if (kauth_proc_label_update_execve(p, + imgp->ip_vfs_context, + imgp->ip_vp, + imgp->ip_scriptlabelp, + imgp->ip_execlabelp)) { + /* + * If updating the MAC label resulted in a + * disjoint credential, flag that we need to + * set the P_SUGID bit. This protects + * against debuggers being attached by an + * insufficiently privileged process onto the + * result of a transition to a more privileged + * credential. + */ + leave_sugid_clear = 0; + } my_cred = kauth_cred_proc_ref(p); mac_task_label_update_cred(my_cred, p->task); kauth_cred_unref(&my_cred); } -#endif +#endif /* CONFIG_MACF */ + /* * Have mach reset the task and thread ports. * We don't want anyone who had the ports before @@ -2640,13 +2659,15 @@ exec_handle_sugid(struct image_params *imgp) } /* - * If 'is_member' is non-zero, then we passed the VSUID and - * MACF checks, and successfully determined that the previous - * cred was a member of the VSGID group, but that it was not - * the default at the time of the execve. So we don't set the - * P_SUGID on the basis of simply running this code. + * If 'leave_sugid_clear' is non-zero, then we passed the + * VSUID and MACF checks, and successfully determined that + * the previous cred was a member of the VSGID group, but + * that it was not the default at the time of the execve, + * and that the post-labelling credential was not disjoint. + * So we don't set the P_SUGID on the basis of simply + * running this code. */ - if (!is_member) + if (!leave_sugid_clear) OSBitOrAtomic(P_SUGID, (UInt32 *)&p->p_flag); /* Cache the vnode for /dev/null the first time around */ @@ -2713,6 +2734,21 @@ exec_handle_sugid(struct image_params *imgp) dev_null = NULLVP; } } +#if CONFIG_MACF + else { + /* + * We are here because we were told that the MAC label will + * be transitioned, and the binary is not VSUID or VSGID; to + * deal with this case, we could either duplicate a lot of + * code, or we can indicate we want to default the P_SUGID + * bit clear and jump back up. 
+ */ + if (mac_transition) { + leave_sugid_clear = 1; + goto handle_mac_transition; + } + } +#endif /* CONFIG_MACF */ /* * Implement the semantic where the effective user and group become diff --git a/bsd/kern/kern_lockf.c b/bsd/kern/kern_lockf.c index 4e61180b6..7269357e4 100644 --- a/bsd/kern/kern_lockf.c +++ b/bsd/kern/kern_lockf.c @@ -131,7 +131,15 @@ static struct lockf *lf_getblock(struct lockf *); static int lf_getlock(struct lockf *, struct flock *); static int lf_setlock(struct lockf *); static int lf_split(struct lockf *, struct lockf *); -static void lf_wakelock(struct lockf *); +static void lf_wakelock(struct lockf *, boolean_t); + + +/* + * in order to mitigate risk + * don't switch to new wake-one method unless + * we have at least this many waiters to wake up + */ +#define SAFE_WAITER_LIMIT 20 /* @@ -259,9 +267,13 @@ lf_advlock(struct vnop_advlock_args *ap) lock->lf_type = fl->l_type; lock->lf_head = head; lock->lf_next = (struct lockf *)0; + lock->lf_waiters = 0; TAILQ_INIT(&lock->lf_blkhd); lock->lf_flags = ap->a_flags; + if (ap->a_flags & F_FLOCK) + lock->lf_flags |= F_WAKE1_SAFE; + lck_mtx_lock(&vp->v_lock); /* protect the lockf list */ /* * Do the requested operation. @@ -502,6 +514,11 @@ lf_setlock(struct lockf *lock) */ lock->lf_next = block; TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); + block->lf_waiters++; + + if ( !(lock->lf_flags & F_FLOCK)) + block->lf_flags &= ~F_WAKE1_SAFE; + #ifdef LOCKF_DEBUGGING if (lockf_debug & 1) { lf_print("lf_setlock: blocking on", block); @@ -509,6 +526,20 @@ lf_setlock(struct lockf *lock) } #endif /* LOCKF_DEBUGGING */ error = msleep(lock, &vp->v_lock, priority, lockstr, 0); + + if (!TAILQ_EMPTY(&lock->lf_blkhd)) { + struct lockf *tlock; + + if ((block = lf_getblock(lock))) { + TAILQ_FOREACH(tlock, &lock->lf_blkhd, lf_block) { + tlock->lf_next = block; + } + TAILQ_CONCAT(&block->lf_blkhd, &lock->lf_blkhd, lf_block); + + block->lf_waiters += lock->lf_waiters; + lock->lf_waiters = 0; + } + } if (error) { /* XXX */ /* * We may have been awakened by a signal and/or by a @@ -520,8 +551,12 @@ lf_setlock(struct lockf *lock) */ if (lock->lf_next) { TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, lf_block); + lock->lf_next->lf_waiters--; lock->lf_next = NOLOCKF; } + if (!TAILQ_EMPTY(&lock->lf_blkhd)) + lf_wakelock(lock, TRUE); + FREE(lock, M_LOCKF); return (error); } /* XXX */ @@ -565,7 +600,7 @@ lf_setlock(struct lockf *lock) */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) - lf_wakelock(overlap); + lf_wakelock(overlap, TRUE); overlap->lf_type = lock->lf_type; FREE(lock, M_LOCKF); lock = overlap; /* for lf_coelesce_adjacent() */ @@ -595,7 +630,7 @@ lf_setlock(struct lockf *lock) return (ENOLCK); } } - lf_wakelock(overlap); + lf_wakelock(overlap, TRUE); break; case OVERLAP_CONTAINED_BY_LOCK: @@ -605,14 +640,18 @@ lf_setlock(struct lockf *lock) */ if (lock->lf_type == F_RDLCK && overlap->lf_type == F_WRLCK) { - lf_wakelock(overlap); + lf_wakelock(overlap, TRUE); } else { while (!TAILQ_EMPTY(&overlap->lf_blkhd)) { ltmp = TAILQ_FIRST(&overlap->lf_blkhd); TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, lf_block); + overlap->lf_waiters--; + TAILQ_INSERT_TAIL(&lock->lf_blkhd, ltmp, lf_block); + lock->lf_waiters++; + ltmp->lf_next = lock; } } @@ -637,7 +676,7 @@ lf_setlock(struct lockf *lock) overlap->lf_next = lock; overlap->lf_end = lock->lf_start - 1; prev = &lock->lf_next; - lf_wakelock(overlap); + lf_wakelock(overlap, TRUE); needtolink = 0; continue; @@ -650,7 +689,7 @@ lf_setlock(struct lockf *lock) lock->lf_next = overlap; } 
overlap->lf_start = lock->lf_end + 1; - lf_wakelock(overlap); + lf_wakelock(overlap, TRUE); break; } break; @@ -704,7 +743,7 @@ lf_clearlock(struct lockf *unlock) /* * Wakeup the list of locks to be retried. */ - lf_wakelock(overlap); + lf_wakelock(overlap, FALSE); switch (ovcase) { case OVERLAP_NONE: /* satisfy compiler enum/switch */ @@ -1048,19 +1087,42 @@ lf_split(struct lockf *lock1, struct lockf *lock2) * in a real-world performance problem. */ static void -lf_wakelock(struct lockf *listhead) +lf_wakelock(struct lockf *listhead, boolean_t force_all) { struct lockf *wakelock; + boolean_t wake_all = TRUE; + + if (force_all == FALSE && (listhead->lf_flags & F_WAKE1_SAFE) && listhead->lf_waiters > SAFE_WAITER_LIMIT) + wake_all = FALSE; while (!TAILQ_EMPTY(&listhead->lf_blkhd)) { wakelock = TAILQ_FIRST(&listhead->lf_blkhd); TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); + listhead->lf_waiters--; + wakelock->lf_next = NOLOCKF; #ifdef LOCKF_DEBUGGING if (lockf_debug & 2) lf_print("lf_wakelock: awakening", wakelock); #endif /* LOCKF_DEBUGGING */ + if (wake_all == FALSE) { + + TAILQ_CONCAT(&wakelock->lf_blkhd, &listhead->lf_blkhd, lf_block); + wakelock->lf_waiters = listhead->lf_waiters; + listhead->lf_waiters = 0; + + if (!TAILQ_EMPTY(&wakelock->lf_blkhd)) { + struct lockf *tlock; + + TAILQ_FOREACH(tlock, &wakelock->lf_blkhd, lf_block) { + tlock->lf_next = wakelock; + } + } + } wakeup(wakelock); + + if (wake_all == FALSE) + break; } } diff --git a/bsd/kern/kern_mib.c b/bsd/kern/kern_mib.c index 01ac7e637..0788ed33e 100644 --- a/bsd/kern/kern_mib.c +++ b/bsd/kern/kern_mib.c @@ -654,6 +654,4 @@ sysctl_mib_init(void) # warning we do not support this platform yet #endif /* __ppc__ */ - } - diff --git a/bsd/kern/kern_sysctl.c b/bsd/kern/kern_sysctl.c index b8673b4bc..d1985d21d 100644 --- a/bsd/kern/kern_sysctl.c +++ b/bsd/kern/kern_sysctl.c @@ -1588,6 +1588,9 @@ kdebug_ops(int *name, u_int namelen, user_addr_t where, { int ret=0; + if (namelen == 0) + return(ENOTSUP); + ret = suser(kauth_cred_get(), &p->p_acflag); if (ret) return(ret); @@ -1637,7 +1640,7 @@ sysctl_procargs2(int *name, u_int namelen, user_addr_t where, } static int -sysctl_procargsx(int *name, __unused u_int namelen, user_addr_t where, +sysctl_procargsx(int *name, u_int namelen, user_addr_t where, size_t *sizep, proc_t cur_proc, int argc_yes) { proc_t p; @@ -1657,6 +1660,9 @@ sysctl_procargsx(int *name, __unused u_int namelen, user_addr_t where, kauth_cred_t my_cred; uid_t uid; + if ( namelen < 1 ) + return(EINVAL); + if (argc_yes) buflen -= sizeof(int); /* reserve first word to return argc */ diff --git a/bsd/kern/kpi_socketfilter.c b/bsd/kern/kpi_socketfilter.c index cefe30473..76cb302f7 100644 --- a/bsd/kern/kpi_socketfilter.c +++ b/bsd/kern/kpi_socketfilter.c @@ -36,6 +36,8 @@ #include #include +#include + #include static struct socket_filter_list sock_filter_head; @@ -327,8 +329,7 @@ sflt_detach_private( lck_mtx_unlock(sock_filter_lock); return; } - } - else { + } else { /* * Clear the removing flag. We will perform the detach here or * request a delayed detach. 
Since we do an extra ref release @@ -344,9 +345,19 @@ sflt_detach_private( if (entry->sfe_socket->so_filteruse != 0) { entry->sfe_flags |= SFEF_DETACHUSEZERO; lck_mtx_unlock(sock_filter_lock); + + if (unregistering) { +#if DEBUG + printf("sflt_detach_private unregistering SFEF_DETACHUSEZERO " + "so%p so_filteruse %u so_usecount %d\n", + entry->sfe_socket, entry->sfe_socket->so_filteruse, + entry->sfe_socket->so_usecount); +#endif + socket_unlock(entry->sfe_socket, 0); + } + return; - } - else { + } else { /* * Check if we are removing the last attached filter and * the parent filter is being unregistered. diff --git a/bsd/kern/mach_loader.c b/bsd/kern/mach_loader.c index 123339d3a..f6ec97d7d 100644 --- a/bsd/kern/mach_loader.c +++ b/bsd/kern/mach_loader.c @@ -568,10 +568,13 @@ parse_machfile( (struct encryption_info_command *) lcp, addr, map, vp); if (ret != LOAD_SUCCESS) { - printf("proc %d: set unprotect error %d " + printf("proc %d: set_code_unprotect() error %d " "for file \"%s\"\n", p->p_pid, ret, vp->v_name); - ret = LOAD_SUCCESS; /* ignore error */ + /* Don't let the app run if it's + * encrypted but we failed to set up the + * decrypter */ + psignal(p, SIGKILL); } break; #endif @@ -1451,7 +1454,7 @@ set_code_unprotect( cryptname="com.apple.null"; break; default: - return LOAD_FAILURE; + return LOAD_BADMACHO; } len = MAXPATHLEN; @@ -1463,9 +1466,9 @@ set_code_unprotect( kr=text_crypter_create(&crypt_info, cryptname, (void*)vpath); if(kr) { - printf("set_code_unprotect: unable to find decrypter %s, kr=%d\n", + printf("set_code_unprotect: unable to create decrypter %s, kr=%d\n", cryptname, kr); - return LOAD_FAILURE; + return LOAD_RESOURCE; } /* this is terrible, but we have to rescan the load commands to find the @@ -1509,12 +1512,16 @@ set_code_unprotect( } /* if we get here, did not find anything */ - return LOAD_FAILURE; + return LOAD_BADMACHO; remap_now: /* now remap using the decrypter */ kr = vm_map_apple_protected(map, map_offset, map_offset+map_size, &crypt_info); - if(kr) printf("set_code_unprotect(): mapping failed with %x\n", kr); + if(kr) { + printf("set_code_unprotect(): mapping failed with %x\n", kr); + crypt_info.crypt_end(crypt_info.crypt_ops); + return LOAD_PROTECT; + } return LOAD_SUCCESS; } diff --git a/bsd/kern/pthread_synch.c b/bsd/kern/pthread_synch.c index 4ccfd04fc..be4e9c165 100644 --- a/bsd/kern/pthread_synch.c +++ b/bsd/kern/pthread_synch.c @@ -1556,6 +1556,9 @@ workq_ops(struct proc *p, struct workq_ops_args *uap, __unused register_t *retv KERNEL_DEBUG(0xefffd008 | DBG_FUNC_NONE, (int)item, 0, 0, 0, 0); + if ((prio < 0) || (prio >= 5)) + return (EINVAL); + workqueue_lock_spin(p); if ((wq = (struct workqueue *)p->p_wqptr) == NULL) { @@ -1568,6 +1571,9 @@ workq_ops(struct proc *p, struct workq_ops_args *uap, __unused register_t *retv break; case WQOPS_QUEUE_REMOVE: { + if ((prio < 0) || (prio >= 5)) + return (EINVAL); + workqueue_lock_spin(p); if ((wq = (struct workqueue *)p->p_wqptr) == NULL) { diff --git a/bsd/kern/uipc_mbuf.c b/bsd/kern/uipc_mbuf.c index 1784e5f1d..0c0a27855 100644 --- a/bsd/kern/uipc_mbuf.c +++ b/bsd/kern/uipc_mbuf.c @@ -388,7 +388,6 @@ typedef struct mcl_slab { * whenever a new piece of memory mapped in from the VM crosses the 1MB * boundary. */ -#define MBSHIFT 20 /* 1MB */ #define NSLABSPMB ((1 << MBSHIFT) >> MCLSHIFT) /* 512 slabs/grp */ typedef struct mcl_slabg { diff --git a/bsd/net/if.c b/bsd/net/if.c index 31d3cd082..2d3d48ae7 100644 --- a/bsd/net/if.c +++ b/bsd/net/if.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. 
All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -103,6 +103,7 @@ /*XXX*/ #include #include +#include #if INET6 #include #include @@ -144,6 +145,9 @@ struct ifnethead ifnet_head = TAILQ_HEAD_INITIALIZER(ifnet_head); static int if_cloners_count; LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners); +static struct ifaddr *ifa_ifwithnet_common(const struct sockaddr *, + unsigned int); + #if INET6 /* * XXX: declare here to avoid to include many inet6 related files.. @@ -641,13 +645,77 @@ ifa_ifwithdstaddr( return result; } +/* + * Locate the source address of an interface based on a complete address. + */ +struct ifaddr * +ifa_ifwithaddr_scoped(const struct sockaddr *addr, unsigned int ifscope) +{ + struct ifaddr *result = NULL; + struct ifnet *ifp; + + if (ifscope == IFSCOPE_NONE) + return (ifa_ifwithaddr(addr)); + + ifnet_head_lock_shared(); + if (ifscope > (unsigned int)if_index) { + ifnet_head_done(); + return (NULL); + } + + ifp = ifindex2ifnet[ifscope]; + if (ifp != NULL) { + struct ifaddr *ifa = NULL; + + /* + * This is suboptimal; there should be a better way + * to search for a given address of an interface. + */ + ifnet_lock_shared(ifp); + for (ifa = ifp->if_addrhead.tqh_first; ifa != NULL; + ifa = ifa->ifa_link.tqe_next) { + if (ifa->ifa_addr->sa_family != addr->sa_family) + continue; + if (equal(addr, ifa->ifa_addr)) { + result = ifa; + break; + } + if ((ifp->if_flags & IFF_BROADCAST) && + ifa->ifa_broadaddr != NULL && + /* IP6 doesn't have broadcast */ + ifa->ifa_broadaddr->sa_len != 0 && + equal(ifa->ifa_broadaddr, addr)) { + result = ifa; + break; + } + } + if (result != NULL) + ifaref(result); + ifnet_lock_done(ifp); + } + ifnet_head_done(); + + return (result); +} + +struct ifaddr * +ifa_ifwithnet(const struct sockaddr *addr) +{ + return (ifa_ifwithnet_common(addr, IFSCOPE_NONE)); +} + +struct ifaddr * +ifa_ifwithnet_scoped(const struct sockaddr *addr, unsigned int ifscope) +{ + return (ifa_ifwithnet_common(addr, ifscope)); +} + /* * Find an interface on a specific network. If many, choice * is most specific found. */ -struct ifaddr * -ifa_ifwithnet( - const struct sockaddr *addr) +static struct ifaddr * +ifa_ifwithnet_common(const struct sockaddr *addr, unsigned int ifscope) { struct ifnet *ifp; struct ifaddr *ifa = NULL; @@ -655,6 +723,9 @@ ifa_ifwithnet( u_int af = addr->sa_family; const char *addr_data = addr->sa_data, *cplim; + if (!ip_doscopedroute || addr->sa_family != AF_INET) + ifscope = IFSCOPE_NONE; + ifnet_head_lock_shared(); /* * AF_LINK addresses can be looked up directly by their index number, @@ -711,6 +782,14 @@ next: continue; } else #endif /* __APPLE__*/ { + /* + * If we're looking up with a scope, + * find using a matching interface. + */ + if (ifscope != IFSCOPE_NONE && + ifp->if_index != ifscope) + continue; + /* * if we have a special address handler, * then use it instead of the generic one. diff --git a/bsd/net/if_var.h b/bsd/net/if_var.h index 02735e846..f26aebe00 100644 --- a/bsd/net/if_var.h +++ b/bsd/net/if_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -672,10 +672,14 @@ void ifma_reference(struct ifmultiaddr *ifma); void ifma_release(struct ifmultiaddr *ifma); struct ifaddr *ifa_ifwithaddr(const struct sockaddr *); +struct ifaddr *ifa_ifwithaddr_scoped(const struct sockaddr *, unsigned int); struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *); struct ifaddr *ifa_ifwithnet(const struct sockaddr *); +struct ifaddr *ifa_ifwithnet_scoped(const struct sockaddr *, unsigned int); struct ifaddr *ifa_ifwithroute(int, const struct sockaddr *, const struct sockaddr *); struct ifaddr *ifa_ifwithroute_locked(int, const struct sockaddr *, const struct sockaddr *); +struct ifaddr *ifa_ifwithroute_scoped_locked(int, const struct sockaddr *, + const struct sockaddr *, unsigned int); struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *); struct ifaddr *ifa_ifpgetprimary(struct ifnet *, int); void ifafree(struct ifaddr *); diff --git a/bsd/net/radix.c b/bsd/net/radix.c index 36aa3bc0d..876675d54 100644 --- a/bsd/net/radix.c +++ b/bsd/net/radix.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -112,8 +112,10 @@ static int rn_lexobetter(void *m_arg, void *n_arg); static struct radix_mask * rn_new_radix_mask(struct radix_node *tt, struct radix_mask *next); -static int rn_satsifies_leaf(char *trial, struct radix_node *leaf, - int skip); +static int rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip, + rn_matchf_t *f, void *w); + +#define RN_MATCHF(rn, f, arg) (f == NULL || (*f)((rn), arg)) /* * The data structure for the keys is a radix tree with one way @@ -208,6 +210,13 @@ rn_refines(void *m_arg, void *n_arg) struct radix_node * rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) +{ + return (rn_lookup_args(v_arg, m_arg, head, NULL, NULL)); +} + +struct radix_node * +rn_lookup_args(void *v_arg, void *m_arg, struct radix_node_head *head, + rn_matchf_t *f, void *w) { struct radix_node *x; caddr_t netmask = NULL; @@ -218,7 +227,7 @@ rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) return (NULL); netmask = x->rn_key; } - x = rn_match(v_arg, head); + x = rn_match_args(v_arg, head, f, w); if (x && netmask) { while (x && x->rn_mask != netmask) x = x->rn_dupedkey; @@ -226,8 +235,16 @@ rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head) return x; } +/* + * Returns true if address 'trial' has no bits differing from the + * leaf's key when compared under the leaf's mask. In other words, + * returns true when 'trial' matches leaf. If a leaf-matching + * routine is passed in, it is also used to find a match on the + * conditions defined by the caller of rn_match. 
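To illustrate the leaf-matching hook being introduced here: below is a minimal, hypothetical filter in the style of rn_match_ifscope() (added to route.c later in this patch). The struct my_entry, my_filter_arg, my_leaf_filter and the me_tag field are invented for illustration; only the rn_matchf_t signature and the rn_match_args()/rnh_lookup_args() entry points come from this change. The callback returns non-zero to accept a candidate leaf; when it rejects an otherwise exact match, the lookup keeps walking duplicate keys and less specific prefixes.

struct my_entry {
	struct radix_node me_nodes[2];	/* must come first, as in struct rtentry */
	unsigned int me_tag;		/* hypothetical per-entry attribute */
};

struct my_filter_arg {
	unsigned int wanted_tag;
};

static int
my_leaf_filter(struct radix_node *rn, void *w)
{
	struct my_entry *e = (struct my_entry *)rn;
	struct my_filter_arg *ma = w;

	/* accept the leaf only if its tag matches the caller's criteria */
	return (e->me_tag == ma->wanted_tag);
}

/* A typical call site, mirroring node_lookup() in route.c: */
static struct radix_node *
my_lookup(struct radix_node_head *rnh, void *dst, void *netmask)
{
	struct my_filter_arg ma = { 42 };	/* made-up criteria */
	struct radix_node *rn;

	rn = rnh->rnh_lookup_args(dst, netmask, rnh, my_leaf_filter, &ma);
	if (rn != NULL && (rn->rn_flags & RNF_ROOT))
		rn = NULL;	/* hit the tree root; treat as no match */
	return (rn);
}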
+ */ static int -rn_satsifies_leaf(char *trial, struct radix_node *leaf, int skip) +rn_satisfies_leaf(char *trial, struct radix_node *leaf, int skip, + rn_matchf_t *f, void *w) { char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask; char *cplim; @@ -241,11 +258,19 @@ rn_satsifies_leaf(char *trial, struct radix_node *leaf, int skip) for (cp += skip; cp < cplim; cp++, cp2++, cp3++) if ((*cp ^ *cp2) & *cp3) return 0; - return 1; + + return (RN_MATCHF(leaf, f, w)); } struct radix_node * rn_match(void *v_arg, struct radix_node_head *head) +{ + return (rn_match_args(v_arg, head, NULL, NULL)); +} + +struct radix_node * +rn_match_args(void *v_arg, struct radix_node_head *head, + rn_matchf_t *f, void *w) { caddr_t v = v_arg; struct radix_node *t = head->rnh_treetop, *x; @@ -291,11 +316,26 @@ rn_match(void *v_arg, struct radix_node_head *head) */ if (t->rn_flags & RNF_ROOT) t = t->rn_dupedkey; - return t; + if (t == NULL || RN_MATCHF(t, f, w)) { + return (t); + } else { + /* + * Although we found an exact match on the key, + * f() is looking for some other criteria as well. + * Continue looking as if the exact match failed. + */ + if (t->rn_parent->rn_flags & RNF_ROOT) { + /* Hit the top; have to give up */ + return (NULL); + } + b = 0; + goto keeplooking; + } on1: test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */ for (b = 7; (test >>= 1) > 0;) b--; +keeplooking: matched_off = cp - v; b += matched_off << 3; rn_bit = -1 - b; @@ -304,17 +344,19 @@ on1: */ if ((saved_t = t)->rn_mask == 0) t = t->rn_dupedkey; - for (; t; t = t->rn_dupedkey) + for (; t; t = t->rn_dupedkey) { /* * Even if we don't match exactly as a host, * we may match if the leaf we wound up at is * a route to a net. */ if (t->rn_flags & RNF_NORMAL) { - if (rn_bit <= t->rn_bit) - return t; - } else if (rn_satsifies_leaf(v, t, matched_off)) - return t; + if ((rn_bit <= t->rn_bit) && RN_MATCHF(t, f, w)) + return (t); + } else if (rn_satisfies_leaf(v, t, matched_off, f, w)) { + return (t); + } + } t = saved_t; /* start searching up the tree */ do { @@ -329,20 +371,21 @@ on1: */ while (m) { if (m->rm_flags & RNF_NORMAL) { - if (rn_bit <= m->rm_bit) + if ((rn_bit <= m->rm_bit) && + RN_MATCHF(m->rm_leaf, f, w)) return (m->rm_leaf); } else { off = min(t->rn_offset, matched_off); x = rn_search_m(v, t, m->rm_mask); while (x && x->rn_mask != m->rm_mask) x = x->rn_dupedkey; - if (x && rn_satsifies_leaf(v, x, off)) - return x; + if (x && rn_satisfies_leaf(v, x, off, f, w)) + return (x); } m = m->rm_mklist; } } while (t != top); - return NULL; + return (NULL); } #ifdef RN_DEBUG @@ -1093,7 +1136,9 @@ rn_inithead(void **head, int off) rnh->rnh_addaddr = rn_addroute; rnh->rnh_deladdr = rn_delete; rnh->rnh_matchaddr = rn_match; + rnh->rnh_matchaddr_args = rn_match_args; rnh->rnh_lookup = rn_lookup; + rnh->rnh_lookup_args = rn_lookup_args; rnh->rnh_walktree = rn_walktree; rnh->rnh_walktree_from = rn_walktree_from; rnh->rnh_treetop = t; diff --git a/bsd/net/radix.h b/bsd/net/radix.h index 3431a2d25..6fa9f77ca 100644 --- a/bsd/net/radix.h +++ b/bsd/net/radix.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -140,6 +140,7 @@ struct radix_mask { #define MKFree(m) { (m)->rm_mklist = rn_mkfreelist; rn_mkfreelist = (m);} typedef int walktree_f_t(struct radix_node *, void *); +typedef int rn_matchf_t(struct radix_node *, void *); struct radix_node_head { struct radix_node *rnh_treetop; @@ -157,8 +158,16 @@ struct radix_node_head { (void *v, void *mask, struct radix_node_head *head); struct radix_node *(*rnh_matchaddr) /* locate based on sockaddr */ (void *v, struct radix_node_head *head); + /* locate based on sockaddr and rn_matchf_t() */ + struct radix_node *(*rnh_matchaddr_args) + (void *v, struct radix_node_head *head, + rn_matchf_t *f, void *w); struct radix_node *(*rnh_lookup) /* locate based on sockaddr */ (void *v, void *mask, struct radix_node_head *head); + /* locate based on sockaddr, mask and rn_matchf_t() */ + struct radix_node *(*rnh_lookup_args) + (void *v, void *mask, struct radix_node_head *head, + rn_matchf_t *f, void *); struct radix_node *(*rnh_matchpkt) /* locate based on packet hdr */ (void *v, struct radix_node_head *head); int (*rnh_walktree) /* traverse tree */ @@ -195,7 +204,10 @@ struct radix_node struct radix_node [2]), *rn_delete(void *, void *, struct radix_node_head *), *rn_lookup(void *v_arg, void *m_arg, struct radix_node_head *head), - *rn_match(void *, struct radix_node_head *); + *rn_lookup_args(void *v_arg, void *m_arg, struct radix_node_head *head, + rn_matchf_t *, void *), + *rn_match(void *, struct radix_node_head *), + *rn_match_args(void *, struct radix_node_head *, rn_matchf_t *, void *); #endif /* PRIVATE */ #endif /* _RADIX_H_ */ diff --git a/bsd/net/route.c b/bsd/net/route.c index 7f4ec5ac6..4e4fb302e 100644 --- a/bsd/net/route.c +++ b/bsd/net/route.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -76,7 +76,9 @@ #include #include +#include #include +#include #include @@ -166,10 +168,198 @@ static void rt_maskedcopy(struct sockaddr *, static void rtable_init(void **); static inline void rtref_audit(struct rtentry_dbg *); static inline void rtunref_audit(struct rtentry_dbg *); +static struct rtentry *rtalloc1_common_locked(struct sockaddr *, int, u_long, + unsigned int); +static int rtrequest_common_locked(int, struct sockaddr *, + struct sockaddr *, struct sockaddr *, int, struct rtentry **, + unsigned int); +static void rtalloc_ign_common_locked(struct route *, u_long, unsigned int); +static inline void sa_set_ifscope(struct sockaddr *, unsigned int); +static struct sockaddr *sin_copy(struct sockaddr_in *, struct sockaddr_in *, + unsigned int); +static struct sockaddr *mask_copy(struct sockaddr *, struct sockaddr_in *, + unsigned int); +static struct radix_node *node_lookup(struct sockaddr *, struct sockaddr *, + unsigned int); +static struct radix_node *node_lookup_default(void); +static int rn_match_ifscope(struct radix_node *, void *); +static struct ifaddr *ifa_ifwithroute_common_locked(int, + const struct sockaddr *, const struct sockaddr *, unsigned int); __private_extern__ u_long route_generation = 0; extern int use_routegenid; +/* + * sockaddr_in with embedded interface scope; this is used internally + * to keep track of scoped route entries in the routing table. The + * fact that such a scope is embedded in the structure is an artifact + * of the current implementation which could change in future. 
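As a concrete illustration of the embedding described here (the structure itself is defined just below): two radix keys for the same address differ only in the four sin_zero bytes that carry the scope, which is what lets a scoped and a non-scoped entry for the same destination coexist as distinct leaves. This is only a sketch; the interface index is made up and it assumes struct sockaddr_inifscope is in scope.

static void
scoped_key_example(void)
{
	struct sockaddr_in plain;
	struct sockaddr_inifscope scoped;

	bzero(&plain, sizeof (plain));
	plain.sin_len = sizeof (plain);
	plain.sin_family = AF_INET;
	plain.sin_addr.s_addr = INADDR_ANY;	/* the default route's key */

	bcopy(&plain, &scoped, sizeof (plain));	/* same length, family, address */
	scoped.sin_ifscope = 4;			/* hypothetical if_index, e.g. en0 */

	/*
	 * bcmp(&plain, &scoped, sizeof (plain)) is now non-zero: the scope
	 * riding in the first four sin_zero bytes makes the keys distinct,
	 * so a scoped and a non-scoped entry for the same destination end
	 * up on different leaves of the same radix tree.
	 */
}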
+ */ +struct sockaddr_inifscope { + __uint8_t sin_len; + sa_family_t sin_family; + in_port_t sin_port; + struct in_addr sin_addr; + /* + * To avoid possible conflict with an overlaid sockaddr_inarp + * having sin_other set to SIN_PROXY, we use the first 4-bytes + * of sin_zero since sin_srcaddr is one of the unused fields + * in sockaddr_inarp. + */ + union { + char sin_zero[8]; + struct { + __uint32_t ifscope; + } _in_index; + } un; +#define sin_ifscope un._in_index.ifscope +}; + +#define SIN(sa) ((struct sockaddr_in *)(size_t)(sa)) +#define SINIFSCOPE(sa) ((struct sockaddr_inifscope *)(size_t)(sa)) + +#define ASSERT_SINIFSCOPE(sa) { \ + if ((sa)->sa_family != AF_INET || \ + (sa)->sa_len < sizeof (struct sockaddr_in)) \ + panic("%s: bad sockaddr_in %p\n", __func__, sa); \ +} + +/* + * Argument to leaf-matching routine; at present it is scoped routing + * specific but can be expanded in future to include other search filters. + */ +struct matchleaf_arg { + unsigned int ifscope; /* interface scope */ +}; + +/* + * For looking up the non-scoped default route (sockaddr instead + * of sockaddr_in for convenience). + */ +static struct sockaddr sin_def = { + sizeof (struct sockaddr_in), AF_INET, { 0, } +}; + +/* + * Interface index (scope) of the primary interface; determined at + * the time when the default, non-scoped route gets added, changed + * or deleted. Protected by rt_mtx. + */ +static unsigned int primary_ifscope = IFSCOPE_NONE; + +#define INET_DEFAULT(dst) \ + ((dst)->sa_family == AF_INET && SIN(dst)->sin_addr.s_addr == 0) + +#define RT(r) ((struct rtentry *)r) +#define RT_HOST(r) (RT(r)->rt_flags & RTF_HOST) + +/* + * Given a route, determine whether or not it is the non-scoped default + * route; dst typically comes from rt_key(rt) but may be coming from + * a separate place when rt is in the process of being created. + */ +boolean_t +rt_inet_default(struct rtentry *rt, struct sockaddr *dst) +{ + return (INET_DEFAULT(dst) && !(rt->rt_flags & RTF_IFSCOPE)); +} + +/* + * Set the ifscope of the primary interface; caller holds rt_mtx. + */ +void +set_primary_ifscope(unsigned int ifscope) +{ + primary_ifscope = ifscope; +} + +/* + * Return the ifscope of the primary interface; caller holds rt_mtx. + */ +unsigned int +get_primary_ifscope(void) +{ + return (primary_ifscope); +} + +/* + * Embed ifscope into a given a sockaddr_in. + */ +static inline void +sa_set_ifscope(struct sockaddr *sa, unsigned int ifscope) +{ + /* Caller must pass in sockaddr_in */ + ASSERT_SINIFSCOPE(sa); + + SINIFSCOPE(sa)->sin_ifscope = ifscope; +} + +/* + * Given a sockaddr_in, return the embedded ifscope to the caller. + */ +unsigned int +sa_get_ifscope(struct sockaddr *sa) +{ + /* Caller must pass in sockaddr_in */ + ASSERT_SINIFSCOPE(sa); + + return (SINIFSCOPE(sa)->sin_ifscope); +} + +/* + * Copy a sockaddr_in src to dst and embed ifscope into dst. + */ +static struct sockaddr * +sin_copy(struct sockaddr_in *src, struct sockaddr_in *dst, unsigned int ifscope) +{ + *dst = *src; + sa_set_ifscope(SA(dst), ifscope); + + return (SA(dst)); +} + +/* + * Copy a mask from src to a sockaddr_in dst and embed ifscope into dst. 
+ */ +static struct sockaddr * +mask_copy(struct sockaddr *src, struct sockaddr_in *dst, unsigned int ifscope) +{ + /* We know dst is at least the size of sockaddr{_in} */ + bzero(dst, sizeof (*dst)); + rt_maskedcopy(src, SA(dst), src); + + /* + * The length of the mask sockaddr would need to be adjusted + * to cover the additional sin_ifscope field; when ifscope is + * IFSCOPE_NONE, we'd end up clearing the embedded ifscope on + * the destination mask in addition to extending the length + * of the sockaddr, as a side effect. This is okay, as any + * trailing zeroes would be skipped by rn_addmask prior to + * inserting or looking up the mask in the mask tree. + */ + SINIFSCOPE(dst)->sin_ifscope = ifscope; + SINIFSCOPE(dst)->sin_len = + offsetof(struct sockaddr_inifscope, sin_ifscope) + + sizeof (SINIFSCOPE(dst)->sin_ifscope); + + return (SA(dst)); +} + +/* + * Callback leaf-matching routine for rn_matchaddr_args used + * for looking up an exact match for a scoped route entry. + */ +static int +rn_match_ifscope(struct radix_node *rn, void *arg) +{ + struct rtentry *rt = (struct rtentry *)rn; + struct matchleaf_arg *ma = arg; + + if (!(rt->rt_flags & RTF_IFSCOPE) || rt_key(rt)->sa_family != AF_INET) + return (0); + + return (SINIFSCOPE(rt_key(rt))->sin_ifscope == ma->ifscope); +} static void rtable_init(void **table) @@ -232,17 +422,29 @@ rtalloc(struct route *ro) void rtalloc_ign_locked(struct route *ro, u_long ignore) +{ + return (rtalloc_ign_common_locked(ro, ignore, IFSCOPE_NONE)); +} + +void +rtalloc_scoped_ign_locked(struct route *ro, u_long ignore, unsigned int ifscope) +{ + return (rtalloc_ign_common_locked(ro, ignore, ifscope)); +} + +static void +rtalloc_ign_common_locked(struct route *ro, u_long ignore, + unsigned int ifscope) { struct rtentry *rt; if ((rt = ro->ro_rt) != NULL) { if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP) return; - /* XXX - We are probably always at splnet here already. */ rtfree_locked(rt); ro->ro_rt = NULL; } - ro->ro_rt = rtalloc1_locked(&ro->ro_dst, 1, ignore); + ro->ro_rt = rtalloc1_common_locked(&ro->ro_dst, 1, ignore, ifscope); if (ro->ro_rt) ro->ro_rt->generation_id = route_generation; } @@ -255,76 +457,99 @@ rtalloc_ign(struct route *ro, u_long ignore) lck_mtx_unlock(rt_mtx); } +struct rtentry * +rtalloc1_locked(struct sockaddr *dst, int report, u_long ignflags) +{ + return (rtalloc1_common_locked(dst, report, ignflags, IFSCOPE_NONE)); +} + +struct rtentry * +rtalloc1_scoped_locked(struct sockaddr *dst, int report, u_long ignflags, + unsigned int ifscope) +{ + return (rtalloc1_common_locked(dst, report, ignflags, ifscope)); +} + /* * Look up the route that matches the address given * Or, at least try.. Create a cloned route if needed. 
*/ -struct rtentry * -rtalloc1_locked(struct sockaddr *dst, int report, u_long ignflags) +static struct rtentry * +rtalloc1_common_locked(struct sockaddr *dst, int report, u_long ignflags, + unsigned int ifscope) { struct radix_node_head *rnh = rt_tables[dst->sa_family]; - struct rtentry *rt; - struct radix_node *rn; - struct rtentry *newrt = 0; + struct rtentry *rt, *newrt = NULL; struct rt_addrinfo info; u_long nflags; int err = 0, msgtype = RTM_MISS; + + if (rnh == NULL) + goto unreachable; + /* - * Look up the address in the table for that Address Family + * Find the longest prefix or exact (in the scoped case) address match; + * callee adds a reference to entry and checks for root node as well */ - if (rnh && (rn = rnh->rnh_matchaddr((caddr_t)dst, rnh)) && - ((rn->rn_flags & RNF_ROOT) == 0)) { + rt = rt_lookup(FALSE, dst, NULL, rnh, ifscope); + if (rt == NULL) + goto unreachable; + + newrt = rt; + nflags = rt->rt_flags & ~ignflags; + if (report && (nflags & (RTF_CLONING | RTF_PRCLONING))) { /* - * If we find it and it's not the root node, then - * get a refernce on the rtentry associated. + * We are apparently adding (report = 0 in delete). + * If it requires that it be cloned, do so. + * (This implies it wasn't a HOST route.) */ - newrt = rt = (struct rtentry *)rn; - nflags = rt->rt_flags & ~ignflags; - if (report && (nflags & (RTF_CLONING | RTF_PRCLONING))) { + err = rtrequest_locked(RTM_RESOLVE, dst, NULL, NULL, 0, &newrt); + if (err) { /* - * We are apparently adding (report = 0 in delete). - * If it requires that it be cloned, do so. - * (This implies it wasn't a HOST route.) + * If the cloning didn't succeed, maybe what we + * have from lookup above will do. Return that; + * no need to hold another reference since it's + * already done. */ - err = rtrequest_locked(RTM_RESOLVE, dst, SA(0), - SA(0), 0, &newrt); - if (err) { - /* - * If the cloning didn't succeed, maybe - * what we have will do. Return that. - */ - newrt = rt; - rtref(rt); - goto miss; - } - if ((rt = newrt) && (rt->rt_flags & RTF_XRESOLVE)) { - /* - * If the new route specifies it be - * externally resolved, then go do that. - */ - msgtype = RTM_RESOLVE; - goto miss; - } - } else - rtref(rt); - } else { + newrt = rt; + goto miss; + } + /* - * Either we hit the root or couldn't find any match, - * Which basically means - * "caint get there frm here" + * We cloned it; drop the original route found during lookup. + * The resulted cloned route (newrt) would now have an extra + * reference held during rtrequest. */ - rtstat.rts_unreach++; - miss: if (report) { + rtfree_locked(rt); + if ((rt = newrt) && (rt->rt_flags & RTF_XRESOLVE)) { /* - * If required, report the failure to the supervising - * Authorities. - * For a delete, this is not an error. (report == 0) + * If the new route specifies it be + * externally resolved, then go do that. */ - bzero((caddr_t)&info, sizeof(info)); - info.rti_info[RTAX_DST] = dst; - rt_missmsg(msgtype, &info, 0, err); + msgtype = RTM_RESOLVE; + goto miss; } } + goto done; + +unreachable: + /* + * Either we hit the root or couldn't find any match, + * Which basically means "cant get there from here" + */ + rtstat.rts_unreach++; +miss: + if (report) { + /* + * If required, report the failure to the supervising + * Authorities. + * For a delete, this is not an error. 
(report == 0) + */ + bzero((caddr_t)&info, sizeof(info)); + info.rti_info[RTAX_DST] = dst; + rt_missmsg(msgtype, &info, 0, err); + } +done: return (newrt); } @@ -370,10 +595,6 @@ rtfree_locked(struct rtentry *rt) if (rt->rt_refcnt > 0) return; - if ((rt->rt_flags & RTF_TRACKREFS) != 0) - printf("%s rt(%p)->rt_refcnt(%d), caller=%p\n", __FUNCTION__, - rt, rt->rt_refcnt, __builtin_return_address(0)); - /* * On last reference give the "close method" a chance to cleanup * private state. This also permits (for IPv4 and IPv6) a chance @@ -500,10 +721,6 @@ rtref(struct rtentry *p) rtref_audit((struct rtentry_dbg *)p); p->rt_refcnt++; - - if ((p->rt_flags & RTF_TRACKREFS) != 0) - printf("%s rt(%p)->rt_refcnt(%d), caller=%p\n", __FUNCTION__, - p, p->rt_refcnt, __builtin_return_address(0)); } static inline void @@ -580,31 +797,40 @@ ifaref(struct ifaddr *ifa) * destination to go through the given gateway. * Normally called as a result of a routing redirect * message from the network layer. - * - * N.B.: must be called at splnet - * */ void -rtredirect(struct sockaddr *dst, struct sockaddr *gateway, - struct sockaddr *netmask, int flags, struct sockaddr *src, - struct rtentry **rtp) +rtredirect(struct ifnet *ifp, struct sockaddr *dst, struct sockaddr *gateway, + struct sockaddr *netmask, int flags, struct sockaddr *src, + struct rtentry **rtp) { - struct rtentry *rt; + struct rtentry *rt = NULL; int error = 0; short *stat = 0; struct rt_addrinfo info; struct ifaddr *ifa = NULL; + unsigned int ifscope = (ifp != NULL) ? ifp->if_index : IFSCOPE_NONE; + struct sockaddr_in sin; lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_NOTOWNED); lck_mtx_lock(rt_mtx); - /* verify the gateway is directly reachable */ - if ((ifa = ifa_ifwithnet(gateway)) == 0) { + /* + * Verify the gateway is directly reachable; if scoped routing + * is enabled, verify that it is reachable from the interface + * where the ICMP redirect arrived on. + */ + if ((ifa = ifa_ifwithnet_scoped(gateway, ifscope)) == NULL) { error = ENETUNREACH; goto out; } - rt = rtalloc1_locked(dst, 0, RTF_CLONING | RTF_PRCLONING); + /* Lookup route to the destination (from the original IP header) */ + rt = rtalloc1_scoped_locked(dst, 0, RTF_CLONING|RTF_PRCLONING, ifscope); + + /* Embed scope in src for comparison against rt_gateway below */ + if (ip_doscopedroute && src->sa_family == AF_INET) + src = sin_copy(SIN(src), &sin, ifscope); + /* * If the redirect isn't from our current router for this dst, * it's either old or wrong. If it redirects us to ourselves, @@ -647,13 +873,14 @@ rtredirect(struct sockaddr *dst, struct sockaddr *gateway, if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) { /* * Changing from route to net => route to host. - * Create new route, rather than smashing route to net. + * Create new route, rather than smashing route + * to net; similar to cloned routes, the newly + * created host route is scoped as well. */ create: flags |= RTF_GATEWAY | RTF_DYNAMIC; - error = rtrequest_locked((int)RTM_ADD, dst, gateway, - netmask, flags, - (struct rtentry **)0); + error = rtrequest_scoped_locked(RTM_ADD, dst, + gateway, netmask, flags, NULL, ifscope); stat = &rtstat.rts_dynamic; } else { /* @@ -666,10 +893,11 @@ rtredirect(struct sockaddr *dst, struct sockaddr *gateway, /* * add the key and gateway (in one malloc'd chunk). 
*/ - rt_setgate(rt, rt_key(rt), gateway); + error = rt_setgate(rt, rt_key(rt), gateway); } - } else + } else { error = EHOSTUNREACH; + } done: if (rt) { if (rtp && !error) @@ -678,10 +906,14 @@ done: rtfree_locked(rt); } out: - if (error) + if (error) { rtstat.rts_badredirect++; - else if (stat != NULL) - (*stat)++; + } else { + if (stat != NULL) + (*stat)++; + if (use_routegenid) + route_generation++; + } bzero((caddr_t)&info, sizeof(info)); info.rti_info[RTAX_DST] = dst; info.rti_info[RTAX_GATEWAY] = gateway; @@ -721,16 +953,47 @@ ifa_ifwithroute( } struct ifaddr * -ifa_ifwithroute_locked( - int flags, - const struct sockaddr *dst, - const struct sockaddr *gateway) +ifa_ifwithroute_locked(int flags, const struct sockaddr *dst, + const struct sockaddr *gateway) +{ + return (ifa_ifwithroute_common_locked((flags & ~RTF_IFSCOPE), dst, + gateway, IFSCOPE_NONE)); +} + +struct ifaddr * +ifa_ifwithroute_scoped_locked(int flags, const struct sockaddr *dst, + const struct sockaddr *gateway, unsigned int ifscope) +{ + if (ifscope != IFSCOPE_NONE) + flags |= RTF_IFSCOPE; + else + flags &= ~RTF_IFSCOPE; + + return (ifa_ifwithroute_common_locked(flags, dst, gateway, ifscope)); +} + +static struct ifaddr * +ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst, + const struct sockaddr *gateway, unsigned int ifscope) { struct ifaddr *ifa = NULL; struct rtentry *rt = NULL; + struct sockaddr_in dst_in, gw_in; lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + if (ip_doscopedroute) { + /* + * Just in case the sockaddr passed in by the caller + * contains embedded scope, make sure to clear it since + * IPv4 interface addresses aren't scoped. + */ + if (dst != NULL && dst->sa_family == AF_INET) + dst = sin_copy(SIN(dst), &dst_in, IFSCOPE_NONE); + if (gateway != NULL && gateway->sa_family == AF_INET) + gateway = sin_copy(SIN(gateway), &gw_in, IFSCOPE_NONE); + } + if (!(flags & RTF_GATEWAY)) { /* * If we are adding a route to an interface, @@ -743,7 +1006,7 @@ ifa_ifwithroute_locked( ifa = ifa_ifwithdstaddr(dst); } if (ifa == NULL) - ifa = ifa_ifwithaddr(gateway); + ifa = ifa_ifwithaddr_scoped(gateway, ifscope); } else { /* * If we are adding a route to a remote net @@ -753,10 +1016,11 @@ ifa_ifwithroute_locked( ifa = ifa_ifwithdstaddr(gateway); } if (ifa == NULL) - ifa = ifa_ifwithnet(gateway); + ifa = ifa_ifwithnet_scoped(gateway, ifscope); if (ifa == NULL) { /* Workaround to avoid gcc warning regarding const variable */ - rt = rtalloc1_locked((struct sockaddr *)(size_t)dst, 0, 0UL); + rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)dst, + 0, 0UL, ifscope); if (rt != NULL) { ifa = rt->rt_ifa; if (ifa != NULL) @@ -784,8 +1048,8 @@ ifa_ifwithroute_locked( */ if ((ifa == NULL || !equal(ifa->ifa_addr, (struct sockaddr *)(size_t)gateway)) && - (rt = rtalloc1_locked((struct sockaddr *)(size_t)gateway, - 0, 0UL)) != NULL) { + (rt = rtalloc1_scoped_locked((struct sockaddr *)(size_t)gateway, + 0, 0UL, ifscope)) != NULL) { if (ifa != NULL) ifafree(ifa); ifa = rt->rt_ifa; @@ -793,6 +1057,17 @@ ifa_ifwithroute_locked( ifaref(ifa); rtunref(rt); } + /* + * If an interface scope was specified, the interface index of + * the found ifaddr must be equivalent to that of the scope; + * otherwise there is no match. 
+ */ + if ((flags & RTF_IFSCOPE) && + ifa != NULL && ifa->ifa_ifp->if_index != ifscope) { + ifafree(ifa); + ifa = NULL; + } + return (ifa); } @@ -806,25 +1081,55 @@ struct rtfc_arg { struct radix_node_head *rnh; }; +int +rtrequest_locked(int req, struct sockaddr *dst, struct sockaddr *gateway, + struct sockaddr *netmask, int flags, struct rtentry **ret_nrt) +{ + return (rtrequest_common_locked(req, dst, gateway, netmask, + (flags & ~RTF_IFSCOPE), ret_nrt, IFSCOPE_NONE)); +} + +int +rtrequest_scoped_locked(int req, struct sockaddr *dst, + struct sockaddr *gateway, struct sockaddr *netmask, int flags, + struct rtentry **ret_nrt, unsigned int ifscope) +{ + if (ifscope != IFSCOPE_NONE) + flags |= RTF_IFSCOPE; + else + flags &= ~RTF_IFSCOPE; + + return (rtrequest_common_locked(req, dst, gateway, netmask, + flags, ret_nrt, ifscope)); +} + /* - * Do appropriate manipulations of a routing tree given - * all the bits of info needed + * Do appropriate manipulations of a routing tree given all the bits of + * info needed. + * + * Embedding the scope in the radix key is an internal job that should be + * left to routines in this module. Callers should specify the scope value + * to the "scoped" variants of route routines instead of manipulating the + * key itself. This is typically done when creating a scoped route, e.g. + * rtrequest(RTM_ADD). Once such a route is created and marked with the + * RTF_IFSCOPE flag, callers can simply use its rt_key(rt) to clone it + * (RTM_RESOLVE) or to remove it (RTM_DELETE). An exception to this is + * during certain routing socket operations where the search key might be + * derived from the routing message itself, in which case the caller must + * specify the destination address and scope value for RTM_ADD/RTM_DELETE. */ -int -rtrequest_locked( - int req, - struct sockaddr *dst, - struct sockaddr *gateway, - struct sockaddr *netmask, - int flags, - struct rtentry **ret_nrt) +static int +rtrequest_common_locked(int req, struct sockaddr *dst0, + struct sockaddr *gateway, struct sockaddr *netmask, int flags, + struct rtentry **ret_nrt, unsigned int ifscope) { int error = 0; struct rtentry *rt; struct radix_node *rn; struct radix_node_head *rnh; struct ifaddr *ifa = NULL; - struct sockaddr *ndst; + struct sockaddr *ndst, *dst = dst0; + struct sockaddr_in sin, mask; #define senderr(x) { error = x ; goto bad; } lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); @@ -839,6 +1144,37 @@ rtrequest_locked( */ if (flags & RTF_HOST) netmask = 0; + + /* + * If RTF_IFSCOPE is specified, use a local copy of the destination + * address to embed the scope into. This logic is repeated below + * in the RTM_RESOLVE handler since the caller does not normally + * specify such a flag during a resolve; instead it passes in the + * route used for cloning for which the scope info is derived from. + * Note also that in the case of RTM_DELETE, the address passed in + * by the caller might already contain the embedded scope info when + * it is the key itself, thus making RTF_IFSCOPE unnecessary; one + * instance where it is explicitly set is inside route_output() + * as part of handling a routing socket request. 
+ */ + if (req != RTM_RESOLVE && (flags & RTF_IFSCOPE)) { + /* Scoped routing is for AF_INET only */ + if (dst->sa_family != AF_INET || + (req == RTM_ADD && !ip_doscopedroute)) + senderr(EINVAL); + + if (ifscope == IFSCOPE_NONE) { + flags &= ~RTF_IFSCOPE; + } else { + /* Embed ifscope into the key (local copy) */ + dst = sin_copy(SIN(dst), &sin, ifscope); + + /* Embed ifscope into netmask (local copy) */ + if (netmask != NULL) + netmask = mask_copy(netmask, &mask, ifscope); + } + } + switch (req) { case RTM_DELETE: /* @@ -901,6 +1237,13 @@ rtrequest_locked( (struct rtentry_dbg *)rt, rtd_trash_link); } + /* + * If this is the (non-scoped) default route, clear + * the interface index used for the primary ifscope. + */ + if (rt_inet_default(rt, rt_key(rt))) + set_primary_ifscope(IFSCOPE_NONE); + /* * If the caller wants it, then it can have it, * but it's up to it to free the rtentry as we won't be @@ -926,20 +1269,54 @@ rtrequest_locked( gateway = rt->rt_gateway; if ((netmask = rt->rt_genmask) == 0) flags |= RTF_HOST; + + if (!ip_doscopedroute || dst->sa_family != AF_INET) + goto makeroute; + /* + * When scoped routing is enabled, cloned entries are + * always scoped according to the interface portion of + * the parent route. The exception to this are IPv4 + * link local addresses. + */ + if (!IN_LINKLOCAL(ntohl(SIN(dst)->sin_addr.s_addr))) { + if (flags & RTF_IFSCOPE) { + ifscope = sa_get_ifscope(rt_key(rt)); + } else { + ifscope = rt->rt_ifp->if_index; + flags |= RTF_IFSCOPE; + } + } else { + ifscope = IFSCOPE_NONE; + flags &= ~RTF_IFSCOPE; + } + + /* Embed or clear ifscope into/from the key (local copy) */ + dst = sin_copy(SIN(dst), &sin, ifscope); + + /* Embed or clear ifscope into/from netmask (local copy) */ + if (netmask != NULL) + netmask = mask_copy(netmask, &mask, ifscope); + goto makeroute; case RTM_ADD: if ((flags & RTF_GATEWAY) && !gateway) - panic("rtrequest: GATEWAY but no gateway"); + panic("rtrequest: RTF_GATEWAY but no gateway"); - if ((ifa = ifa_ifwithroute_locked(flags, dst, gateway)) == 0) + if (flags & RTF_IFSCOPE) { + ifa = ifa_ifwithroute_scoped_locked(flags, dst0, + gateway, ifscope); + } else { + ifa = ifa_ifwithroute_locked(flags, dst0, gateway); + } + if (ifa == NULL) senderr(ENETUNREACH); - - makeroute: +makeroute: if ((rt = rte_alloc()) == NULL) senderr(ENOBUFS); Bzero(rt, sizeof(*rt)); rt->rt_flags = RTF_UP | flags; + /* * Add the gateway. Possibly re-malloc-ing the storage for it * also add the rt_gwroute if possible. @@ -957,9 +1334,9 @@ rtrequest_locked( /* * make sure it contains the value we want (masked if needed). */ - if (netmask) { + if (netmask) rt_maskedcopy(dst, ndst, netmask); - } else + else Bcopy(dst, ndst, dst->sa_len); /* @@ -983,8 +1360,13 @@ rtrequest_locked( * mechanism, then we just blow it away and retry * the insertion of the new one. */ - rt2 = rtalloc1_locked(dst, 0, - RTF_CLONING | RTF_PRCLONING); + if (flags & RTF_IFSCOPE) { + rt2 = rtalloc1_scoped_locked(dst0, 0, + RTF_CLONING | RTF_PRCLONING, ifscope); + } else { + rt2 = rtalloc1_locked(dst, 0, + RTF_CLONING | RTF_PRCLONING); + } if (rt2 && rt2->rt_parent) { rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt2), @@ -1052,6 +1434,13 @@ rtrequest_locked( rt_fixchange, &arg); } + /* + * If this is the (non-scoped) default route, record + * the interface index used for the primary ifscope. + */ + if (rt_inet_default(rt, rt_key(rt))) + set_primary_ifscope(rt->rt_ifp->if_index); + /* * actually return a resultant rtentry and * give the caller a single reference. 
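A hedged usage sketch of the scoped request path added above, in the spirit of what route_output() does further down for an RTM_ADD carrying RTF_IFSCOPE. The addresses, flag combination and interface index are invented, and the caller is assumed to hold rt_mtx as rtrequest_scoped_locked() requires.

static int
add_scoped_host_route_example(void)
{
	struct sockaddr_in d, g;
	struct rtentry *rt = NULL;
	int error;

	bzero(&d, sizeof (d));
	d.sin_len = sizeof (d);
	d.sin_family = AF_INET;
	d.sin_addr.s_addr = htonl(0x0a000001);	/* 10.0.0.1 (made up) */

	bzero(&g, sizeof (g));
	g.sin_len = sizeof (g);
	g.sin_family = AF_INET;
	g.sin_addr.s_addr = htonl(0x0a0000fe);	/* 10.0.0.254 (made up) */

	/* scope the route to a hypothetical interface with if_index 4 */
	error = rtrequest_scoped_locked(RTM_ADD, (struct sockaddr *)&d,
	    (struct sockaddr *)&g, NULL, RTF_HOST | RTF_GATEWAY | RTF_STATIC,
	    &rt, 4);
	if (error == 0 && rt != NULL) {
		/*
		 * rt_key(rt) now carries the embedded scope and RTF_IFSCOPE
		 * is set; a later RTM_DELETE or RTM_RESOLVE can reuse
		 * rt_key(rt) directly, per the comment above.  Drop the
		 * reference rtrequest handed back.
		 */
		rtunref(rt);
	}
	return (error);
}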
@@ -1121,10 +1510,6 @@ rt_fixdelete(struct radix_node *rn, void *vp) * routine just for adds. I'm not sure why I thought it was necessary to do * changes this way. */ -#ifdef DEBUG -static int rtfcdebug = 0; -#endif - static int rt_fixchange(struct radix_node *rn, void *vp) { @@ -1135,36 +1520,20 @@ rt_fixchange(struct radix_node *rn, void *vp) u_char *xk1, *xm1, *xk2, *xmp; int i, len, mlen; -#ifdef DEBUG - if (rtfcdebug) - printf("rt_fixchange: rt %p, rt0 %p\n", rt, rt0); -#endif - lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); if (!rt->rt_parent || - (rt->rt_flags & (RTF_PINNED | RTF_CLONING | RTF_PRCLONING))) { -#ifdef DEBUG - if(rtfcdebug) printf("no parent or pinned\n"); -#endif - return 0; - } + (rt->rt_flags & (RTF_PINNED | RTF_CLONING | RTF_PRCLONING))) + return (0); - if (rt->rt_parent == rt0) { -#ifdef DEBUG - if(rtfcdebug) printf("parent match\n"); -#endif - return rtrequest_locked(RTM_DELETE, rt_key(rt), - (struct sockaddr *)0, rt_mask(rt), - rt->rt_flags, (struct rtentry **)0); - } + if (rt->rt_parent == rt0) + goto delete_rt; /* * There probably is a function somewhere which does this... * if not, there should be. */ - len = imin(((struct sockaddr *)rt_key(rt0))->sa_len, - ((struct sockaddr *)rt_key(rt))->sa_len); + len = imin(rt_key(rt0)->sa_len, rt_key(rt)->sa_len); xk1 = (u_char *)rt_key(rt0); xm1 = (u_char *)rt_mask(rt0); @@ -1172,140 +1541,168 @@ rt_fixchange(struct radix_node *rn, void *vp) /* avoid applying a less specific route */ xmp = (u_char *)rt_mask(rt->rt_parent); - mlen = ((struct sockaddr *)rt_key(rt->rt_parent))->sa_len; - if (mlen > ((struct sockaddr *)rt_key(rt0))->sa_len) { -#if DEBUG - if (rtfcdebug) - printf("rt_fixchange: inserting a less " - "specific route\n"); -#endif - return 0; - } + mlen = rt_key(rt->rt_parent)->sa_len; + if (mlen > rt_key(rt0)->sa_len) + return (0); + for (i = rnh->rnh_treetop->rn_offset; i < mlen; i++) { - if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i]) { -#if DEBUG - if (rtfcdebug) - printf("rt_fixchange: inserting a less " - "specific route\n"); -#endif - return 0; - } + if ((xmp[i] & ~(xmp[i] ^ xm1[i])) != xmp[i]) + return (0); } for (i = rnh->rnh_treetop->rn_offset; i < len; i++) { - if ((xk2[i] & xm1[i]) != xk1[i]) { -#ifdef DEBUG - if(rtfcdebug) printf("no match\n"); -#endif - return 0; - } + if ((xk2[i] & xm1[i]) != xk1[i]) + return (0); } /* * OK, this node is a clone, and matches the node currently being * changed/added under the node's mask. So, get rid of it. */ -#ifdef DEBUG - if(rtfcdebug) printf("deleting\n"); -#endif - return rtrequest_locked(RTM_DELETE, rt_key(rt), (struct sockaddr *)0, - rt_mask(rt), rt->rt_flags, (struct rtentry **)0); +delete_rt: + return (rtrequest_locked(RTM_DELETE, rt_key(rt), NULL, + rt_mask(rt), rt->rt_flags, NULL)); } int -rt_setgate(struct rtentry *rt0, struct sockaddr *dst, struct sockaddr *gate) +rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate) { - caddr_t new, old; int dlen = ROUNDUP(dst->sa_len), glen = ROUNDUP(gate->sa_len); - struct rtentry *rt = rt0; struct radix_node_head *rnh = rt_tables[dst->sa_family]; + + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + /* * A host route with the destination equal to the gateway * will interfere with keeping LLINFO in the routing * table, so disallow it. 
*/ - - lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); - - if (((rt0->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) == - (RTF_HOST|RTF_GATEWAY)) && - (dst->sa_len == gate->sa_len) && + if (((rt->rt_flags & (RTF_HOST|RTF_GATEWAY|RTF_LLINFO)) == + (RTF_HOST|RTF_GATEWAY)) && (dst->sa_len == gate->sa_len) && (bcmp(dst, gate, dst->sa_len) == 0)) { /* * The route might already exist if this is an RTM_CHANGE * or a routing redirect, so try to delete it. */ - if (rt_key(rt0)) - rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt0), - rt0->rt_gateway, rt_mask(rt0), rt0->rt_flags, 0); - return EADDRNOTAVAIL; + if (rt_key(rt)) + rtrequest_locked(RTM_DELETE, rt_key(rt), + rt->rt_gateway, rt_mask(rt), rt->rt_flags, NULL); + return (EADDRNOTAVAIL); } /* - * Both dst and gateway are stored in the same malloc'd chunk - * (If I ever get my hands on....) - * if we need to malloc a new chunk, then keep the old one around - * till we don't need it any more. + * The destination is not directly reachable. Get a route + * to the next-hop gateway and store it in rt_gwroute. */ - if (rt->rt_gateway == 0 || glen > ROUNDUP(rt->rt_gateway->sa_len)) { - old = (caddr_t)rt_key(rt); - R_Malloc(new, caddr_t, dlen + glen); - if (new == 0) - return ENOBUFS; - rt->rt_nodes->rn_key = new; - } else { + if (rt->rt_flags & RTF_GATEWAY) { + struct rtentry *gwrt; + unsigned int ifscope; + + ifscope = (dst->sa_family == AF_INET) ? + sa_get_ifscope(dst) : IFSCOPE_NONE; + + gwrt = rtalloc1_scoped_locked(gate, 1, RTF_PRCLONING, ifscope); + /* - * otherwise just overwrite the old one + * Cloning loop avoidance: + * + * In the presence of protocol-cloning and bad configuration, + * it is possible to get stuck in bottomless mutual recursion + * (rtrequest rt_setgate rtalloc1). We avoid this by not + * allowing protocol-cloning to operate for gateways (which + * is probably the correct choice anyway), and avoid the + * resulting reference loops by disallowing any route to run + * through itself as a gateway. This is obviously mandatory + * when we get rt->rt_output(). It implies that a route to + * the gateway must already be present in the system in order + * for the gateway to be referred to by another route. */ - new = rt->rt_nodes->rn_key; - old = 0; + if (gwrt == rt) { + rtunref(gwrt); + return (EADDRINUSE); /* failure */ + } + + /* If scoped, the gateway route must use the same interface */ + if (ifscope != IFSCOPE_NONE && (rt->rt_flags & RTF_IFSCOPE) && + gwrt != NULL && gwrt->rt_ifp != NULL && + gwrt->rt_ifp->if_index != ifscope) { + rtfree_locked(gwrt); + return ((rt->rt_flags & RTF_HOST) ? + EHOSTUNREACH : ENETUNREACH); + } + + if (rt->rt_gwroute != NULL) + rtfree_locked(rt->rt_gwroute); + rt->rt_gwroute = gwrt; + + /* + * In case the (non-scoped) default route gets modified via + * an ICMP redirect, record the interface index used for the + * primary ifscope. Also done in rt_setif() to take care + * of the non-redirect cases. + */ + if (rt_inet_default(rt, dst) && rt->rt_ifp != NULL) + set_primary_ifscope(rt->rt_ifp->if_index); + + /* + * Tell the kernel debugger about the new default gateway + * if the gateway route uses the primary interface, or + * if we are in a transient state before the non-scoped + * default gateway is installed (similar to how the system + * was behaving in the past). In future, it would be good + * to do all this only when KDP is enabled. 
+ */ + if ((dst->sa_family == AF_INET) && + gwrt != NULL && gwrt->rt_gateway->sa_family == AF_LINK && + (gwrt->rt_ifp->if_index == get_primary_ifscope() || + get_primary_ifscope() == IFSCOPE_NONE)) + kdp_set_gateway_mac(SDL(gwrt->rt_gateway)->sdl_data); } /* - * copy the new gateway value into the memory chunk + * Prepare to store the gateway in rt_gateway. Both dst and gateway + * are stored one after the other in the same malloc'd chunk. If we + * have room, reuse the old buffer since rt_gateway already points + * to the right place. Otherwise, malloc a new block and update + * the 'dst' address and point rt_gateway to the right place. */ - Bcopy(gate, (rt->rt_gateway = (struct sockaddr *)(new + dlen)), glen); + if (rt->rt_gateway == NULL || glen > ROUNDUP(rt->rt_gateway->sa_len)) { + caddr_t new; - /* - * if we are replacing the chunk (or it's new) we need to - * replace the dst as well - */ - if (old) { + /* The underlying allocation is done with M_WAITOK set */ + R_Malloc(new, caddr_t, dlen + glen); + if (new == NULL) { + if (rt->rt_gwroute != NULL) + rtfree_locked(rt->rt_gwroute); + rt->rt_gwroute = NULL; + return (ENOBUFS); + } + + /* + * Copy from 'dst' and not rt_key(rt) because we can get + * here to initialize a newly allocated route entry, in + * which case rt_key(rt) is NULL (and so does rt_gateway). + */ Bcopy(dst, new, dlen); - R_Free(old); + R_Free(rt_key(rt)); /* free old block; NULL is okay */ + rt->rt_nodes->rn_key = new; + rt->rt_gateway = (struct sockaddr *)(new + dlen); } /* - * If there is already a gwroute, it's now almost definitly wrong - * so drop it. + * Copy the new gateway value into the memory chunk. */ - if (rt->rt_gwroute) { - rt = rt->rt_gwroute; rtfree_locked(rt); - rt = rt0; rt->rt_gwroute = 0; - } + Bcopy(gate, rt->rt_gateway, glen); + /* - * Cloning loop avoidance: - * In the presence of protocol-cloning and bad configuration, - * it is possible to get stuck in bottomless mutual recursion - * (rtrequest rt_setgate rtalloc1). We avoid this by not allowing - * protocol-cloning to operate for gateways (which is probably the - * correct choice anyway), and avoid the resulting reference loops - * by disallowing any route to run through itself as a gateway. - * This is obviously mandatory when we get rt->rt_output(). + * For consistency between rt_gateway and rt_key(gwrt). 
*/ - if (rt->rt_flags & RTF_GATEWAY) { - rt->rt_gwroute = rtalloc1_locked(gate, 1, RTF_PRCLONING); - if (rt->rt_gwroute == rt) { - rtfree_locked(rt->rt_gwroute); - rt->rt_gwroute = 0; - return EDQUOT; /* failure */ - } - /* Tell the kernel debugger about the new default gateway */ - if ((AF_INET == rt->rt_gateway->sa_family) && - rt->rt_gwroute && rt->rt_gwroute->rt_gateway && - (AF_LINK == rt->rt_gwroute->rt_gateway->sa_family)) { - kdp_set_gateway_mac(((struct sockaddr_dl *)rt0->rt_gwroute->rt_gateway)->sdl_data); - } + if ((rt->rt_flags & RTF_GATEWAY) && rt->rt_gwroute != NULL && + (rt->rt_gwroute->rt_flags & RTF_IFSCOPE) && + rt->rt_gateway->sa_family == AF_INET && + rt_key(rt->rt_gwroute)->sa_family == AF_INET) { + sa_set_ifscope(rt->rt_gateway, + sa_get_ifscope(rt_key(rt->rt_gwroute))); } /* @@ -1318,10 +1715,10 @@ rt_setgate(struct rtentry *rt0, struct sockaddr *dst, struct sockaddr *gate) arg.rnh = rnh; arg.rt0 = rt; rnh->rnh_walktree_from(rnh, rt_key(rt), rt_mask(rt), - rt_fixchange, &arg); + rt_fixchange, &arg); } - return 0; + return (0); } static void @@ -1344,6 +1741,202 @@ rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2)); } +/* + * Lookup an AF_INET scoped or non-scoped route depending on the ifscope + * value passed in by the caller (IFSCOPE_NONE implies non-scoped). + */ +static struct radix_node * +node_lookup(struct sockaddr *dst, struct sockaddr *netmask, + unsigned int ifscope) +{ + struct radix_node_head *rnh = rt_tables[AF_INET]; + struct radix_node *rn; + struct sockaddr_in sin, mask; + struct matchleaf_arg ma = { ifscope }; + rn_matchf_t *f = rn_match_ifscope; + void *w = &ma; + + if (dst->sa_family != AF_INET) + return (NULL); + + /* + * Embed ifscope into the search key; for a non-scoped + * search this will clear out any embedded scope value. + */ + dst = sin_copy(SIN(dst), &sin, ifscope); + + /* Embed (or clear) ifscope into netmask */ + if (netmask != NULL) + netmask = mask_copy(netmask, &mask, ifscope); + + if (ifscope == IFSCOPE_NONE) + f = w = NULL; + + rn = rnh->rnh_lookup_args(dst, netmask, rnh, f, w); + if (rn != NULL && (rn->rn_flags & RNF_ROOT)) + rn = NULL; + + return (rn); +} + +/* + * Lookup the AF_INET non-scoped default route. + */ +static struct radix_node * +node_lookup_default(void) +{ + struct radix_node_head *rnh = rt_tables[AF_INET]; + return (rnh->rnh_lookup(&sin_def, NULL, rnh)); +} + +/* + * Common routine to lookup/match a route. It invokes the lookup/matchaddr + * callback which could be address family-specific. The main difference + * between the two (at least for AF_INET/AF_INET6) is that a lookup does + * not alter the expiring state of a route, whereas a match would unexpire + * or revalidate the route. + * + * The optional scope or interface index property of a route allows for a + * per-interface route instance. This permits multiple route entries having + * the same destination (but not necessarily the same gateway) to exist in + * the routing table; each of these entries is specific to the corresponding + * interface. This is made possible by embedding the scope value into the + * radix key, thus making each route entry unique. These scoped entries + * exist along with the regular, non-scoped entries in the same radix tree + * for a given address family (currently AF_INET only); the scope logically + * partitions it into multiple per-interface sub-trees. 
+ * + * When a scoped route lookup is performed, the routing table is searched for + * the best match that would result in a route using the same interface as the + * one associated with the scope (the exception to this are routes that point + * to the loopback interface). The search rule follows the longest matching + * prefix with the additional interface constraint. + */ +struct rtentry * +rt_lookup(boolean_t lookup_only, struct sockaddr *dst, struct sockaddr *netmask, + struct radix_node_head *rnh, unsigned int ifscope) +{ + struct radix_node *rn0, *rn; + boolean_t dontcare = (ifscope == IFSCOPE_NONE); + + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + + if (!lookup_only) + netmask = NULL; + + /* + * Non-scoped route lookup. + */ + if (!ip_doscopedroute || dst->sa_family != AF_INET) { + if (lookup_only) + rn = rnh->rnh_lookup(dst, netmask, rnh); + else + rn = rnh->rnh_matchaddr(dst, rnh); + goto done; + } + + /* + * Scoped route lookup: + * + * We first perform a non-scoped lookup for the original result. + * Afterwards, depending on whether or not the caller has specified + * a scope, we perform a more specific scoped search and fallback + * to this original result upon failure. + */ + rn0 = rn = node_lookup(dst, netmask, IFSCOPE_NONE); + + /* + * If the caller did not specify a scope, use the primary scope + * derived from the system's non-scoped default route. If, for + * any reason, there is no primary interface, return what we have. + */ + if (dontcare && (ifscope = get_primary_ifscope()) == IFSCOPE_NONE) + goto validate; + + /* + * Keep the original result if either of the following is true: + * + * 1) The interface portion of the route has the same interface + * index as the scope value and it is marked with RTF_IFSCOPE. + * 2) The route uses the loopback interface, in which case the + * destination (host/net) is local/loopback. + * + * Otherwise, do a more specified search using the scope. + */ + if (rn != NULL) { + struct rtentry *rt = RT(rn); + if (rt->rt_ifp != lo_ifp) { + if (rt->rt_ifp->if_index != ifscope) { + /* + * Wrong interface; keep the original result + * only if the caller did not specify a scope, + * and do a more specific scoped search using + * the scope of the found route. Otherwise, + * start again from scratch. + */ + rn = NULL; + if (dontcare) + ifscope = rt->rt_ifp->if_index; + else + rn0 = NULL; + } else if (!(rt->rt_flags & RTF_IFSCOPE)) { + /* + * Right interface, except that this route + * isn't marked with RTF_IFSCOPE. Do a more + * specific scoped search. Keep the original + * result and return it it in case the scoped + * search fails. + */ + rn = NULL; + } + } + } + + /* + * Scoped search. Find the most specific entry having the same + * interface scope as the one requested. The following will result + * in searching for the longest prefix scoped match. + */ + if (rn == NULL) + rn = node_lookup(dst, netmask, ifscope); + + /* + * Use the original result if either of the following is true: + * + * 1) The scoped search did not yield any result. + * 2) The result from the scoped search is a scoped default route, + * and the original (non-scoped) result is not a default route, + * i.e. the original result is a more specific host/net route. + * 3) The scoped search yielded a net route but the original + * result is a host route, i.e. the original result is treated + * as a more specific route. 
+ */ + if (rn == NULL || (rn0 != NULL && + ((INET_DEFAULT(rt_key(RT(rn))) && !INET_DEFAULT(rt_key(RT(rn0)))) || + (!RT_HOST(rn) && RT_HOST(rn0))))) + rn = rn0; + + /* + * If we still don't have a route, use the non-scoped default + * route as long as the interface portion satistifes the scope. + */ + if (rn == NULL && (rn = node_lookup_default()) != NULL && + RT(rn)->rt_ifp->if_index != ifscope) + rn = NULL; + +validate: + if (rn != NULL && !lookup_only) + (void) in_validate(rn); + +done: + if (rn != NULL && (rn->rn_flags & RNF_ROOT)) + rn = NULL; + else if (rn != NULL) + rtref(RT(rn)); + + return (RT(rn)); +} + /* * Set up a routing table entry, normally * for an interface. diff --git a/bsd/net/route.h b/bsd/net/route.h index 9d26a8bbd..cfc95a6aa 100644 --- a/bsd/net/route.h +++ b/bsd/net/route.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -84,8 +84,12 @@ struct rtentry; struct route { struct rtentry *ro_rt; struct sockaddr ro_dst; - u_long reserved[2]; /* for future use if needed */ + u_int32_t ro_flags; /* route flags (see below) */ + u_int32_t reserved; /* for future use if needed */ }; + +#define ROF_SRCIF_SELECTED 0x1 /* source interface was selected */ + #else struct route; #endif /* PRIVATE */ @@ -195,8 +199,8 @@ struct ortentry { #define RTF_LOCAL 0x200000 /* route represents a local address */ #define RTF_BROADCAST 0x400000 /* route represents a bcast address */ #define RTF_MULTICAST 0x800000 /* route represents a mcast address */ -#define RTF_TRACKREFS 0x1000000 /* Debug references and releases */ - /* 0x1000000 and up unassigned */ +#define RTF_IFSCOPE 0x1000000 /* has valid interface scope */ + /* 0x2000000 and up unassigned */ /* * Routing statistics. @@ -323,6 +327,11 @@ struct route_cb { }; #ifdef KERNEL_PRIVATE +/* + * For scoped routing; a zero interface scope value means nil/no scope. 
+ */ +#define IFSCOPE_NONE 0 + #define RTFREE(rt) rtfree(rt) extern struct route_cb route_cb; extern struct radix_node_head *rt_tables[AF_MAX+1]; @@ -338,11 +347,19 @@ extern void rt_missmsg(int, struct rt_addrinfo *, int, int); extern void rt_newaddrmsg(int, struct ifaddr *, int, struct rtentry *); extern void rt_newmaddrmsg(int, struct ifmultiaddr *); extern int rt_setgate(struct rtentry *, struct sockaddr *, struct sockaddr *); +extern void set_primary_ifscope(unsigned int); +extern unsigned int get_primary_ifscope(void); +extern boolean_t rt_inet_default(struct rtentry *, struct sockaddr *); +extern struct rtentry *rt_lookup(boolean_t, struct sockaddr *, + struct sockaddr *, struct radix_node_head *, unsigned int); extern void rtalloc(struct route *); extern void rtalloc_ign(struct route *, u_long); -extern void rtalloc_ign_locked(struct route *, u_long ); +extern void rtalloc_ign_locked(struct route *, u_long); +extern void rtalloc_scoped_ign_locked(struct route *, u_long, unsigned int); extern struct rtentry *rtalloc1(struct sockaddr *, int, u_long); extern struct rtentry *rtalloc1_locked(struct sockaddr *, int, u_long); +extern struct rtentry *rtalloc1_scoped_locked(struct sockaddr *, int, + u_long, unsigned int); extern void rtfree(struct rtentry *); extern void rtfree_locked(struct rtentry *); extern void rtref(struct rtentry *); @@ -356,14 +373,17 @@ extern void rtsetifa(struct rtentry *, struct ifaddr *); extern int rtinit(struct ifaddr *, int, int); extern int rtinit_locked(struct ifaddr *, int, int); extern int rtioctl(int, caddr_t, struct proc *); -extern void rtredirect(struct sockaddr *, struct sockaddr *, +extern void rtredirect(struct ifnet *, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct sockaddr *, struct rtentry **); extern int rtrequest(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **); extern int rtrequest_locked(int, struct sockaddr *, struct sockaddr *, struct sockaddr *, int, struct rtentry **); +extern int rtrequest_scoped_locked(int, struct sockaddr *, struct sockaddr *, + struct sockaddr *, int, struct rtentry **, unsigned int); extern struct rtentry *rte_alloc(void); extern void rte_free(struct rtentry *); +extern unsigned int sa_get_ifscope(struct sockaddr *); #endif KERNEL_PRIVATE #endif diff --git a/bsd/net/rtsock.c b/bsd/net/rtsock.c index b4aee361b..b6836fd6c 100644 --- a/bsd/net/rtsock.c +++ b/bsd/net/rtsock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -83,7 +83,6 @@ #include extern struct rtstat rtstat; -extern int rttrash; extern u_long route_generation; extern int use_routegenid; extern int check_routeselfref; @@ -113,7 +112,9 @@ static int sysctl_iflist2(int af, struct walkarg *w); static int route_output(struct mbuf *, struct socket *); static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics *); static void rt_setif(struct rtentry *, struct sockaddr *, struct sockaddr *, - struct sockaddr *); + struct sockaddr *, unsigned int); + +#define SIN(sa) ((struct sockaddr_in *)(size_t)(sa)) /* Sleazy use of local variables throughout file, warning!!!! 
*/ #define dst info.rti_info[RTAX_DST] @@ -308,10 +309,13 @@ route_output(struct mbuf *m, struct socket *so) #ifndef __APPLE__ struct proc *curproc = current_proc(); #endif + struct sockaddr_in dst_in, gate_in; int sendonlytoself = 0; + unsigned int ifscope = IFSCOPE_NONE; #define senderr(e) { error = e; goto flush;} - if (m == 0 || ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == 0)) + if (m == NULL || + ((m->m_len < sizeof(long)) && (m = m_pullup(m, sizeof(long))) == 0)) return (ENOBUFS); if ((m->m_flags & M_PKTHDR) == 0) panic("route_output"); @@ -323,20 +327,20 @@ route_output(struct mbuf *m, struct socket *so) len = m->m_pkthdr.len; if (len < sizeof(*rtm) || len != mtod(m, struct rt_msghdr *)->rtm_msglen) { - dst = 0; + dst = NULL; senderr(EINVAL); } R_Malloc(rtm, struct rt_msghdr *, len); - if (rtm == 0) { - dst = 0; + if (rtm == NULL) { + dst = NULL; senderr(ENOBUFS); } m_copydata(m, 0, len, (caddr_t)rtm); if (rtm->rtm_version != RTM_VERSION) { - dst = 0; + dst = NULL; senderr(EPROTONOSUPPORT); } - + /* * Silent version of RTM_GET for Reachabiltiy APIs. We may change * all RTM_GETs to be silent in the future, so this is private for now. @@ -347,26 +351,52 @@ route_output(struct mbuf *m, struct socket *so) sendonlytoself = 1; rtm->rtm_type = RTM_GET; } - + /* * Perform permission checking, only privileged sockets * may perform operations other than RTM_GET */ if (rtm->rtm_type != RTM_GET && (so->so_state & SS_PRIV) == 0) { - dst = 0; + dst = NULL; senderr(EPERM); } rtm->rtm_pid = proc_selfpid(); info.rti_addrs = rtm->rtm_addrs; if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, &info)) { - dst = 0; + dst = NULL; senderr(EINVAL); } - if (dst == 0 || (dst->sa_family >= AF_MAX) - || (gate != 0 && (gate->sa_family >= AF_MAX))) { + if (dst == NULL || (dst->sa_family >= AF_MAX) || + (gate != NULL && (gate->sa_family >= AF_MAX))) { senderr(EINVAL); } + + if (dst->sa_family == AF_INET && dst->sa_len != sizeof (dst_in)) { + /* At minimum, we need up to sin_addr */ + if (dst->sa_len < offsetof(struct sockaddr_in, sin_zero)) + senderr(EINVAL); + bzero(&dst_in, sizeof (dst_in)); + dst_in.sin_len = sizeof (dst_in); + dst_in.sin_family = AF_INET; + dst_in.sin_port = SIN(dst)->sin_port; + dst_in.sin_addr = SIN(dst)->sin_addr; + dst = (struct sockaddr *)&dst_in; + } + + if (gate != NULL && + gate->sa_family == AF_INET && gate->sa_len != sizeof (gate_in)) { + /* At minimum, we need up to sin_addr */ + if (gate->sa_len < offsetof(struct sockaddr_in, sin_zero)) + senderr(EINVAL); + bzero(&gate_in, sizeof (gate_in)); + gate_in.sin_len = sizeof (gate_in); + gate_in.sin_family = AF_INET; + gate_in.sin_port = SIN(gate)->sin_port; + gate_in.sin_addr = SIN(gate)->sin_addr; + gate = (struct sockaddr *)&gate_in; + } + if (genmask) { struct radix_node *t; t = rn_addmask((caddr_t)genmask, 0, 1); @@ -375,10 +405,21 @@ route_output(struct mbuf *m, struct socket *so) else senderr(ENOBUFS); } + + /* + * If RTF_IFSCOPE flag is set, then rtm_index specifies the scope. 
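As context for the RTF_IFSCOPE handling in this hunk: a user-space routing-socket client requests a scoped operation by setting RTF_IFSCOPE in rtm_flags and carrying the interface index in rtm_index, and scoped requests are honored for AF_INET destinations only. The sketch below is illustrative rather than part of the patch; the interface name "en0", the destination 192.0.2.1, and the helper name are placeholders.

#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>          /* if_nametoindex() */
#include <net/route.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <unistd.h>

/* sketch: ask the kernel for the route to 192.0.2.1, scoped to "en0" */
static void
scoped_rtm_get(void)
{
        struct {
                struct rt_msghdr hdr;
                struct sockaddr_in dst;
        } msg;
        int rs = socket(PF_ROUTE, SOCK_RAW, 0);

        memset(&msg, 0, sizeof (msg));
        msg.hdr.rtm_msglen  = sizeof (msg);
        msg.hdr.rtm_version = RTM_VERSION;
        msg.hdr.rtm_type    = RTM_GET;
        msg.hdr.rtm_addrs   = RTA_DST;
        msg.hdr.rtm_flags   = RTF_UP | RTF_HOST | RTF_IFSCOPE;
        msg.hdr.rtm_index   = if_nametoindex("en0");    /* the scope */
        msg.hdr.rtm_seq     = 1;

        msg.dst.sin_len         = sizeof (msg.dst);
        msg.dst.sin_family      = AF_INET;   /* scoped requests are IPv4-only */
        msg.dst.sin_addr.s_addr = inet_addr("192.0.2.1");

        (void) write(rs, &msg, msg.hdr.rtm_msglen);
        /* a real client would read() the RTM_GET reply and close(rs) */
}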
+ */ + if (rtm->rtm_flags & RTF_IFSCOPE) { + /* Scoped routing is for AF_INET only */ + if (dst->sa_family != AF_INET) + senderr(EINVAL); + ifscope = rtm->rtm_index; + } + switch (rtm->rtm_type) { - + case RTM_ADD: - if (gate == 0) + if (gate == NULL) senderr(EINVAL); #ifdef __APPLE__ @@ -409,8 +450,8 @@ route_output(struct mbuf *m, struct socket *so) } } #endif - error = rtrequest_locked(RTM_ADD, dst, gate, netmask, - rtm->rtm_flags, &saved_nrt); + error = rtrequest_scoped_locked(RTM_ADD, dst, gate, + netmask, rtm->rtm_flags, &saved_nrt, ifscope); if (error == 0 && saved_nrt) { #ifdef __APPLE__ /* @@ -441,21 +482,22 @@ route_output(struct mbuf *m, struct socket *so) * dwiggins@bbn.com */ - rt_setif(saved_nrt, ifpaddr, ifaaddr, gate); + rt_setif(saved_nrt, ifpaddr, ifaaddr, gate, + ifscope); #endif rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, &saved_nrt->rt_rmx); saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); saved_nrt->rt_rmx.rmx_locks |= (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); - rtunref(saved_nrt); saved_nrt->rt_genmask = genmask; + rtunref(saved_nrt); } break; case RTM_DELETE: - error = rtrequest_locked(RTM_DELETE, dst, gate, netmask, - rtm->rtm_flags, &saved_nrt); + error = rtrequest_scoped_locked(RTM_DELETE, dst, + gate, netmask, rtm->rtm_flags, &saved_nrt, ifscope); if (error == 0) { rt = saved_nrt; goto report; @@ -465,13 +507,17 @@ route_output(struct mbuf *m, struct socket *so) case RTM_GET: case RTM_CHANGE: case RTM_LOCK: - if ((rnh = rt_tables[dst->sa_family]) == 0) { + if ((rnh = rt_tables[dst->sa_family]) == NULL) senderr(EAFNOSUPPORT); - } else if ((rt = (struct rtentry *) - rnh->rnh_lookup(dst, netmask, rnh)) != NULL) - rtref(rt); - else + + /* + * Lookup the best match based on the key-mask pair; + * callee adds a reference and checks for root node. + */ + rt = rt_lookup(TRUE, dst, netmask, rnh, ifscope); + if (rt == NULL) senderr(ESRCH); + switch(rtm->rtm_type) { case RTM_GET: { @@ -534,7 +580,8 @@ route_output(struct mbuf *m, struct socket *so) * equivalent to the code found at this very spot * in BSD. */ - rt_setif(rt, ifpaddr, ifaaddr, gate); + rt_setif(rt, ifpaddr, ifaaddr, gate, + ifscope); #endif rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, @@ -638,11 +685,8 @@ rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics *out) * Set route's interface given ifpaddr, ifaaddr, and gateway. */ static void -rt_setif( - struct rtentry *rt, - struct sockaddr *Ifpaddr, - struct sockaddr *Ifaaddr, - struct sockaddr *Gate) +rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr, + struct sockaddr *Gate, unsigned int ifscope) { struct ifaddr *ifa = 0; struct ifnet *ifp = 0; @@ -653,17 +697,16 @@ rt_setif( if (use_routegenid) route_generation++; - /* new gateway could require new ifaddr, ifp; - flags may also be different; ifp may be specified - by ll sockaddr when protocol address is ambiguous */ - if (Ifpaddr && (ifa = ifa_ifwithnet(Ifpaddr)) && + /* + * New gateway could require new ifaddr, ifp; flags may also + * be different; ifp may be specified by ll sockaddr when + * protocol address is ambiguous. + */ + if (Ifpaddr && (ifa = ifa_ifwithnet_scoped(Ifpaddr, ifscope)) && (ifp = ifa->ifa_ifp) && (Ifaaddr || Gate)) { - ifafree(ifa); - ifa = ifaof_ifpforaddr(Ifaaddr ? Ifaaddr : Gate, - ifp); - } - else - { + ifafree(ifa); + ifa = ifaof_ifpforaddr(Ifaaddr ? 
Ifaaddr : Gate, ifp); + } else { if (ifa) { ifafree(ifa); ifa = 0; @@ -671,32 +714,36 @@ rt_setif( if (Ifpaddr && (ifp = if_withname(Ifpaddr)) ) { if (Gate) { ifa = ifaof_ifpforaddr(Gate, ifp); - } - else { + } else { ifnet_lock_shared(ifp); ifa = TAILQ_FIRST(&ifp->if_addrhead); ifaref(ifa); ifnet_lock_done(ifp); } - } - else if (Ifaaddr && (ifa = ifa_ifwithaddr(Ifaaddr))) { + } else if (Ifaaddr && + (ifa = ifa_ifwithaddr_scoped(Ifaaddr, ifscope))) { ifp = ifa->ifa_ifp; - } - else if (Gate && (ifa = ifa_ifwithroute_locked(rt->rt_flags, - rt_key(rt), Gate))) { + } else if (Gate && + (ifa = ifa_ifwithroute_scoped_locked(rt->rt_flags, + rt_key(rt), Gate, ifscope))) { ifp = ifa->ifa_ifp; } } if (ifa) { struct ifaddr *oifa = rt->rt_ifa; if (oifa != ifa) { - if (oifa && oifa->ifa_rtrequest) - oifa->ifa_rtrequest(RTM_DELETE, - rt, Gate); + if (oifa && oifa->ifa_rtrequest) + oifa->ifa_rtrequest(RTM_DELETE, rt, Gate); rtsetifa(rt, ifa); - rt->rt_ifp = ifp; - rt->rt_rmx.rmx_mtu = ifp->if_mtu; - if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) + rt->rt_ifp = ifp; + /* + * If this is the (non-scoped) default route, record + * the interface index used for the primary ifscope. + */ + if (rt_inet_default(rt, rt_key(rt))) + set_primary_ifscope(rt->rt_ifp->if_index); + rt->rt_rmx.rmx_mtu = ifp->if_mtu; + if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); } else { ifafree(ifa); @@ -705,7 +752,7 @@ rt_setif( ifafree(ifa); return; } - call_ifareq: +call_ifareq: /* XXX: to reset gateway to correct value, at RTM_CHANGE */ if (rt->rt_ifa && rt->rt_ifa->ifa_rtrequest) rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, Gate); @@ -1311,6 +1358,7 @@ sysctl_rttrash(struct sysctl_req *req) static int sysctl_rtsock SYSCTL_HANDLER_ARGS { +#pragma unused(oidp) int *name = (int *)arg1; u_int namelen = arg2; struct radix_node_head *rnh; diff --git a/bsd/netinet/in.c b/bsd/netinet/in.c index 2663dd4c3..3a1c1bc6e 100644 --- a/bsd/netinet/in.c +++ b/bsd/netinet/in.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -785,15 +785,6 @@ in_control( */ in_ifscrub(ifp, ia, 1); ifa = &ia->ia_ifa; -#if CONFIG_FORCE_OUT_IFP - // Cleanup any pdp hack related route - if (ia->ia_route) - { - ia->ia_route->rt_flags &= ~RTF_UP; - rtfree_locked(ia->ia_route); - ia->ia_route = NULL; - } -#endif lck_mtx_unlock(rt_mtx); ifnet_lock_exclusive(ifp); if_detach_ifa(ifp, ifa); diff --git a/bsd/netinet/in.h b/bsd/netinet/in.h index 7f23a9e6a..0fcbd52d1 100644 --- a/bsd/netinet/in.h +++ b/bsd/netinet/in.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -366,7 +366,7 @@ struct sockaddr_in { sa_family_t sin_family; in_port_t sin_port; struct in_addr sin_addr; - char sin_zero[8]; /* XXX bwg2001-004 */ + char sin_zero[8]; }; #define INET_ADDRSTRLEN 16 @@ -414,7 +414,8 @@ struct ip_opts { #ifdef __APPLE__ #define IP_STRIPHDR 23 /* bool: drop receive of raw IP header */ #endif -#define IP_RECVTTL 24 /* bool; receive reception TTL w/dgram */ +#define IP_RECVTTL 24 /* bool; receive reception TTL w/dgram */ +#define IP_BOUND_IF 25 /* set/get bound interface */ #define IP_FW_ADD 40 /* add a firewall rule to chain */ @@ -441,8 +442,7 @@ struct ip_opts { #define IP_TRAFFIC_MGT_BACKGROUND 65 /* int*; get background IO flags; set background IO */ #ifdef PRIVATE -/* This is a hack, this is only a hack. */ -#define IP_FORCE_OUT_IFP 69 /* char ifname[] - send traffic on this interface */ +#define IP_FORCE_OUT_IFP 69 /* deprecated; use IP_BOUND_IF instead */ #endif /* Background socket configuration flags */ diff --git a/bsd/netinet/in_arp.c b/bsd/netinet/in_arp.c index 174aa7742..940b4bf0d 100644 --- a/bsd/netinet/in_arp.c +++ b/bsd/netinet/in_arp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2007 Apple Inc. All rights reserved. + * Copyright (c) 2004-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -372,7 +372,8 @@ arp_lookup_route( const struct in_addr *addr, int create, int proxy, - route_t *route) + route_t *route, + unsigned int ifscope) { struct sockaddr_inarp sin = {sizeof(sin), AF_INET, 0, {0}, {0}, 0, 0}; const char *why = NULL; @@ -383,8 +384,9 @@ arp_lookup_route( sin.sin_addr.s_addr = addr->s_addr; sin.sin_other = proxy ? SIN_PROXY : 0; - - *route = rtalloc1_locked((struct sockaddr*)&sin, create, 0); + + *route = rtalloc1_scoped_locked((struct sockaddr*)&sin, + create, 0, ifscope); if (*route == NULL) return ENETUNREACH; @@ -416,7 +418,7 @@ arp_lookup_route( if (why && create && log_arp_warnings) { char tmp[MAX_IPv4_STR_LEN]; - log(LOG_DEBUG, "arplookup %s failed: %s\n", + log(LOG_DEBUG, "arplookup link#%d %s failed: %s\n", ifscope, inet_ntop(AF_INET, addr, tmp, sizeof(tmp)), why); } @@ -453,7 +455,8 @@ arp_route_to_gateway_route( if ((route->rt_flags & RTF_UP) == 0) { /* route is down, find a new one */ - hint = route = rtalloc1_locked(net_dest, 1, 0); + hint = route = rtalloc1_scoped_locked(net_dest, + 1, 0, route->rt_ifp->if_index); if (hint) { rtunref(hint); } @@ -474,7 +477,9 @@ arp_route_to_gateway_route( if (route->rt_gwroute != 0) rtfree_locked(route->rt_gwroute); - route->rt_gwroute = rtalloc1_locked(route->rt_gateway, 1, 0); + route->rt_gwroute = rtalloc1_scoped_locked( + route->rt_gateway, 1, 0, + route->rt_ifp->if_index); if (route->rt_gwroute == 0) { lck_mtx_unlock(rt_mtx); return EHOSTUNREACH; @@ -560,7 +565,8 @@ arp_lookup_ip( * route and link layer information. */ if (route == NULL || route->rt_llinfo == NULL) - result = arp_lookup_route(&net_dest->sin_addr, 1, 0, &route); + result = arp_lookup_route(&net_dest->sin_addr, 1, 0, &route, + ifp->if_index); if (result || route == NULL || route->rt_llinfo == NULL) { char tmp[MAX_IPv4_STR_LEN]; @@ -706,10 +712,11 @@ arp_ip_handle_input( /* * Look up the routing entry. If it doesn't exist and we are the - * target, go ahead and create one. + * target, and the sender isn't 0.0.0.0, go ahead and create one. 
*/ - error = arp_lookup_route(&sender_ip->sin_addr, (target_ip->sin_addr.s_addr == - best_ia->ia_addr.sin_addr.s_addr), 0, &route); + error = arp_lookup_route(&sender_ip->sin_addr, + (target_ip->sin_addr.s_addr == best_ia->ia_addr.sin_addr.s_addr && + sender_ip->sin_addr.s_addr != 0), 0, &route, ifp->if_index); if (error || route == 0 || route->rt_gateway == 0) { if (arpop != ARPOP_REQUEST) { goto respond; @@ -723,7 +730,8 @@ arp_ip_handle_input( * Verify this ARP probe doesn't conflict with an IPv4LL we know of * on another interface. */ - error = arp_lookup_route(&target_ip->sin_addr, 0, 0, &route); + error = arp_lookup_route(&target_ip->sin_addr, 0, 0, + &route, ifp->if_index); if (error == 0 && route && route->rt_gateway) { gateway = SDL(route->rt_gateway); if (route->rt_ifp != ifp && gateway->sdl_alen != 0 @@ -768,7 +776,8 @@ arp_ip_handle_input( /* don't create entry if link-local address and link-local is disabled */ if (!IN_LINKLOCAL(ntohl(sender_ip->sin_addr.s_addr)) || (ifp->if_eflags & IFEF_ARPLL) != 0) { - error = arp_lookup_route(&sender_ip->sin_addr, 1, 0, &route); + error = arp_lookup_route(&sender_ip->sin_addr, + 1, 0, &route, ifp->if_index); if (error == 0 && route != NULL && route->rt_gateway != NULL) { created_announcement = 1; } @@ -877,7 +886,8 @@ respond: if (target_ip->sin_addr.s_addr != best_ia->ia_addr.sin_addr.s_addr) { /* Find a proxy route */ - error = arp_lookup_route(&target_ip->sin_addr, 0, SIN_PROXY, &route); + error = arp_lookup_route(&target_ip->sin_addr, 0, SIN_PROXY, + &route, ifp->if_index); if (error || route == NULL) { /* We don't have a route entry indicating we should use proxy */ @@ -888,7 +898,9 @@ respond: } /* See if we have a route to the target ip before we proxy it */ - route = rtalloc1_locked((const struct sockaddr*)target_ip, 0, 0); + route = rtalloc1_scoped_locked( + (const struct sockaddr *)target_ip, 0, 0, + ifp->if_index); if (!route) { lck_mtx_unlock(rt_mtx); return 0; diff --git a/bsd/netinet/in_gif.c b/bsd/netinet/in_gif.c index 0c74fc181..787c688ce 100644 --- a/bsd/netinet/in_gif.c +++ b/bsd/netinet/in_gif.c @@ -113,6 +113,7 @@ in_gif_output( struct ip iphdr; /* capsule IP header, host byte ordered */ int proto, error; u_int8_t tos; + struct ip_out_args ipoa = { IFSCOPE_NONE }; if (sin_src == NULL || sin_dst == NULL || sin_src->sin_family != AF_INET || @@ -226,7 +227,7 @@ in_gif_output( #endif } - error = ip_output(m, NULL, &sc->gif_ro, 0, NULL, NULL); + error = ip_output(m, NULL, &sc->gif_ro, IP_OUTARGS, NULL, &ipoa); return(error); } @@ -386,7 +387,10 @@ gif_encapcheck4( sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_addr = ip.ip_src; - rt = rtalloc1((struct sockaddr *)&sin, 0, 0UL); + lck_mtx_lock(rt_mtx); + rt = rtalloc1_scoped_locked((struct sockaddr *)&sin, 0, 0, + m->m_pkthdr.rcvif->if_index); + lck_mtx_unlock(rt_mtx); if (!rt || rt->rt_ifp != m->m_pkthdr.rcvif) { #if 0 log(LOG_WARNING, "%s: packet from 0x%x dropped " diff --git a/bsd/netinet/in_pcb.c b/bsd/netinet/in_pcb.c index fce3bb78b..af785060a 100644 --- a/bsd/netinet/in_pcb.c +++ b/bsd/netinet/in_pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -313,6 +313,7 @@ in_pcblookup_local_and_cleanup( } #ifdef __APPLE_API_PRIVATE +static void in_pcb_conflict_post_msg(u_int16_t port) { /* @@ -569,77 +570,6 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p) return (0); } -#if CONFIG_FORCE_OUT_IFP -/* - * pdp_context_route_locked is losely based on rtalloc_ign_locked with - * the hope that it can be used anywhere rtalloc_ign_locked is. - */ -__private_extern__ void -pdp_context_route_locked(ifnet_t ifp, struct route *ro) -{ - struct in_ifaddr *ia; - struct rtentry *rt; - - if ((rt = ro->ro_rt) != NULL) { - if (rt->rt_ifp == ifp && rt->rt_flags & RTF_UP) - return; - - rtfree_locked(rt); - ro->ro_rt = NULL; - } - - if (ifp == NULL) - return; - - /* Find the first IP address, we will use a fake route off of that */ - TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link) { - if (ia->ia_ifp == ifp) - break; - } - - /* Hrmm no IP addresses here :( */ - if (ia == NULL) - return; - - rt = ia->ia_route; - if (rt == NULL) { - struct sockaddr *ifa = ia->ia_ifa.ifa_addr; - - /* Allocate and set up a fake route */ - if ((rt = rte_alloc()) == NULL) - return; - - bzero(rt, sizeof(*rt)); - rt->rt_flags = RTF_UP | RTF_STATIC; - if (rt_setgate(rt, ifa, ifa) != 0) { - rte_free(rt); - return; - } - /* - * Explicitly zero the key so that: - * rt_tables[rt_key(rt)->sa_family] == rt_tables[0] == NULL - */ - bzero(rt_key(rt), ifa->sa_len); - - rtsetifa(rt, &ia->ia_ifa); - rt->rt_ifp = rt->rt_ifa->ifa_ifp; - - /* Take a reference for the ia pointer to this */ - ia->ia_route = rt; - rtref(rt); - - /* - * One more rtentry floating around that is not - * linked to the routing table. - */ - (void) OSIncrementAtomic((SInt32 *)&rttrash); - } - rt->generation_id = route_generation; - rtref(rt); /* increment the reference count */ - ro->ro_rt = rt; -} -#endif - /* * Transform old in_pcbconnect() into an inner subroutine for new * in_pcbconnect(): Do some validity-checking on the remote @@ -691,8 +621,11 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, } if (inp->inp_laddr.s_addr == INADDR_ANY) { struct route *ro; + unsigned int ifscope; ia = (struct in_ifaddr *)0; + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; /* * If route is known or can be allocated now, * our src addr is taken from the i/f, else punt. 
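The conditional above, which derives the route-lookup scope from INP_BOUND_IF/inp_boundif, is the same idiom this patch repeats later in rip_output() and tcp_ip_output(). As a hypothetical refactoring only (not part of the patch), it could be captured in a small inline helper; INP_BOUND_IF, inp_boundif, and IFSCOPE_NONE come from the headers extended by this change.

/* hypothetical helper, not in the patch; mirrors the repeated idiom */
static __inline unsigned int
inp_outif_scope(const struct inpcb *inp)
{
        return ((inp->inp_flags & INP_BOUND_IF) ?
            inp->inp_boundif : IFSCOPE_NONE);
}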
@@ -718,14 +651,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = sin->sin_addr; -#if CONFIG_FORCE_OUT_IFP - /* If the socket has requested a specific interface, use that address */ - if (inp->pdp_ifp != NULL) { - pdp_context_route_locked(inp->pdp_ifp, ro); - } - else -#endif /* CONFIG_FORCE_OUT_IFP */ - rtalloc_ign_locked(ro, 0UL); + rtalloc_scoped_ign_locked(ro, 0UL, ifscope); } /* * If we found a route, use the address @@ -744,7 +670,8 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, sin->sin_port = 0; ia = ifatoia(ifa_ifwithdstaddr(sintosa(sin))); if (ia == 0) { - ia = ifatoia(ifa_ifwithnet(sintosa(sin))); + ia = ifatoia(ifa_ifwithnet_scoped(sintosa(sin), + ifscope)); } sin->sin_port = fport; if (ia == 0) { @@ -963,7 +890,6 @@ in_pcbdispose(struct inpcb *inp) so->so_saved_pcb = (caddr_t) inp; so->so_pcb = 0; inp->inp_socket = 0; - inp->reserved[0] = (u_int32_t)so; #if CONFIG_MACF_NET mac_inpcb_label_destroy(inp); #endif @@ -1699,7 +1625,6 @@ in_pcb_detach_port( in_pcbremlists(inp); inp->inp_socket = 0; - inp->reserved[0] = (u_int32_t) so; zfree(pcbinfo->ipi_zone, inp); pcbinfo->nat_dummy_socket.so_pcb = (caddr_t)pcbinfo->nat_dummy_pcb; /* restores dummypcb */ } diff --git a/bsd/netinet/in_pcb.h b/bsd/netinet/in_pcb.h index 0186e42a5..f3dec2200 100644 --- a/bsd/netinet/in_pcb.h +++ b/bsd/netinet/in_pcb.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -192,17 +192,11 @@ struct inpcb { #else void *inpcb_mtx; #endif - u_int32_t reserved[4]; /* future use (some already used) */ + unsigned int inp_boundif; /* interface scope for INP_BOUND_IF */ + u_int32_t inp_reserved[3]; /* reserved for future use */ #if CONFIG_MACF_NET struct label *inp_label; /* MAC label */ #endif -#if CONFIG_FORCE_OUT_IFP -#ifdef _KERN_SYS_KERNELTYPES_H_ - ifnet_t pdp_ifp; -#else - void *pdp_ifp; -#endif /* _KERN_SYS_KERNELTYPES_H_ */ -#endif /* CONFIG_EMBEDDED */ #if CONFIG_IP_EDGEHOLE u_int32_t inpcb_edgehole_flags; u_int32_t inpcb_edgehole_mask; @@ -448,6 +442,7 @@ struct inpcbinfo { /* XXX documentation, prefixes */ #define INP_RECVTTL 0x1000 #define INP_UDP_NOCKSUM 0x2000 /* Turn off outbound UDP checksum */ +#define INP_BOUND_IF 0x4000 /* bind socket to an ifindex */ #define IN6P_IPV6_V6ONLY 0x008000 /* restrict AF_INET6 socket for v6 */ @@ -577,9 +572,6 @@ in_pcb_rem_share_client(struct inpcbinfo *pcbinfo, u_char owner_id); void in_pcbremlists(struct inpcb *inp); int in_pcb_ckeckstate(struct inpcb *, int, int); void inpcb_to_compat(struct inpcb *inp, struct inpcb_compat *inp_compat); -#if CONFIG_FORCE_OUT_IFP -void pdp_context_route_locked(ifnet_t ifp, struct route *ro); -#endif #endif /* KERNEL */ #endif /* KERNEL_PRIVATE */ diff --git a/bsd/netinet/in_rmx.c b/bsd/netinet/in_rmx.c index 371a6db64..419befad8 100644 --- a/bsd/netinet/in_rmx.c +++ b/bsd/netinet/in_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000,2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -91,6 +91,9 @@ extern u_long route_generation; static void in_rtqtimo(void *rock); #endif +static struct radix_node *in_matroute_args(void *, struct radix_node_head *, + rn_matchf_t *f, void *); + #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* @@ -154,8 +157,8 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, * Find out if it is because of an * ARP entry and delete it if so. */ - rt2 = rtalloc1_locked((struct sockaddr *)sin, 0, - RTF_CLONING | RTF_PRCLONING); + rt2 = rtalloc1_scoped_locked(rt_key(rt), 0, + RTF_CLONING | RTF_PRCLONING, sa_get_ifscope(rt_key(rt))); if (rt2) { if (rt2->rt_flags & RTF_LLINFO && rt2->rt_flags & RTF_HOST && @@ -174,24 +177,43 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, return ret; } +/* + * Validate (unexpire) an expiring AF_INET route. + */ +struct radix_node * +in_validate(struct radix_node *rn) +{ + struct rtentry *rt = (struct rtentry *)rn; + + /* This is first reference? */ + if (rt != NULL && rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) { + rt->rt_flags &= ~RTPRF_OURS; + rt->rt_rmx.rmx_expire = 0; + } + return (rn); +} + +/* + * Similar to in_matroute_args except without the leaf-matching parameters. + */ +static struct radix_node * +in_matroute(void *v_arg, struct radix_node_head *head) +{ + return (in_matroute_args(v_arg, head, NULL, NULL)); +} + /* * This code is the inverse of in_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. */ static struct radix_node * -in_matroute(void *v_arg, struct radix_node_head *head) +in_matroute_args(void *v_arg, struct radix_node_head *head, + rn_matchf_t *f, void *w) { - struct radix_node *rn = rn_match(v_arg, head); - struct rtentry *rt = (struct rtentry *)rn; + struct radix_node *rn = rn_match_args(v_arg, head, f, w); - if(rt && rt->rt_refcnt == 0) { /* this is first reference */ - if(rt->rt_flags & RTPRF_OURS) { - rt->rt_flags &= ~RTPRF_OURS; - rt->rt_rmx.rmx_expire = 0; - } - } - return rn; + return (in_validate(rn)); } static int rtq_reallyold = 60*60; @@ -430,6 +452,7 @@ in_inithead(void **head, int off) rnh = *head; rnh->rnh_addaddr = in_addroute; rnh->rnh_matchaddr = in_matroute; + rnh->rnh_matchaddr_args = in_matroute_args; rnh->rnh_close = in_clsroute; in_rtqtimo(rnh); /* kick off timeout first time */ return 1; diff --git a/bsd/netinet/in_var.h b/bsd/netinet/in_var.h index 35826b528..fe1bc4899 100644 --- a/bsd/netinet/in_var.h +++ b/bsd/netinet/in_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -94,9 +94,6 @@ struct in_ifaddr { struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */ #define ia_broadaddr ia_dstaddr struct sockaddr_in ia_sockmask; /* reserve space for general netmask */ -#if CONFIG_FORCE_OUT_IFP - struct rtentry *ia_route; /* PDP context hack - a faux route we can use */ -#endif }; #endif /* PRIVATE */ @@ -307,6 +304,7 @@ void in_delmulti(struct in_multi **); int in_control(struct socket *, u_long, caddr_t, struct ifnet *, struct proc *); void in_rtqdrain(void); +extern struct radix_node *in_validate(struct radix_node *); void ip_input(struct mbuf *); int in_ifadown(struct ifaddr *ifa, int); void in_ifscrub(struct ifnet *, struct in_ifaddr *, int); diff --git a/bsd/netinet/ip_divert.c b/bsd/netinet/ip_divert.c index ebc0772b0..f0708780d 100644 --- a/bsd/netinet/ip_divert.c +++ b/bsd/netinet/ip_divert.c @@ -351,6 +351,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, /* Reinject packet into the system as incoming or outgoing */ if (!sin || sin->sin_addr.s_addr == 0) { + struct ip_out_args ipoa = { IFSCOPE_NONE }; + /* * Don't allow both user specified and setsockopt options, * and don't allow packet length sizes that will crash @@ -377,8 +379,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr *addr, error = ip_output(m, inp->inp_options, &inp->inp_route, (so->so_options & SO_DONTROUTE) | - IP_ALLOWBROADCAST | IP_RAWOUTPUT, - inp->inp_moptions, NULL); + IP_ALLOWBROADCAST | IP_RAWOUTPUT | IP_OUTARGS, + inp->inp_moptions, &ipoa); socket_lock(so, 0); } else { struct ifaddr *ifa; diff --git a/bsd/netinet/ip_dummynet.c b/bsd/netinet/ip_dummynet.c index 426b22d5d..b146d94d6 100644 --- a/bsd/netinet/ip_dummynet.c +++ b/bsd/netinet/ip_dummynet.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -479,6 +479,7 @@ transmit_event(struct dn_pipe *pipe) (void)ip_output(m, NULL, NULL, pkt->flags, NULL, NULL); if (tmp_rt.ro_rt) { rtfree(tmp_rt.ro_rt); + tmp_rt.ro_rt = NULL; } break ; } @@ -1254,6 +1255,8 @@ dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) pkt->dn_dst = fwa->dst; pkt->flags = fwa->flags; + if (fwa->ipoa != NULL) + pkt->ipoa = *(fwa->ipoa); } if (q->head == NULL) q->head = m; @@ -1362,8 +1365,10 @@ dropit: struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \ if (tag) { \ struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \ - if (n->ro.ro_rt) \ + if (n->ro.ro_rt) { \ rtfree(n->ro.ro_rt); \ + n->ro.ro_rt = NULL; \ + } \ } \ m_tag_delete(_m, tag); \ m_freem(_m); \ diff --git a/bsd/netinet/ip_dummynet.h b/bsd/netinet/ip_dummynet.h index 312b7e266..1994be1ba 100644 --- a/bsd/netinet/ip_dummynet.h +++ b/bsd/netinet/ip_dummynet.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -148,6 +148,8 @@ struct dn_heap { * processing requirements. */ #ifdef KERNEL +#include /* for ip_out_args */ + struct dn_pkt_tag { struct ip_fw *rule; /* matching rule */ int dn_dir; /* action when packet comes out. */ @@ -160,6 +162,7 @@ struct dn_pkt_tag { struct sockaddr_in *dn_dst ; struct route ro; /* route, for ip_output. MUST COPY */ int flags ; /* flags, for ip_output (IPv6 ?) */ + struct ip_out_args ipoa; /* output args, for ip_output. 
MUST COPY */ }; #else struct dn_pkt; diff --git a/bsd/netinet/ip_flow.c b/bsd/netinet/ip_flow.c index d6c1f128f..4fb3f8596 100644 --- a/bsd/netinet/ip_flow.c +++ b/bsd/netinet/ip_flow.c @@ -279,6 +279,7 @@ ipflow_reap( LIST_REMOVE(ipf, ipf_next); ipflow_addstats(ipf); rtfree(ipf->ipf_ro.ro_rt); + ipf->ipf_ro.ro_rt = NULL; return ipf; } /* note: called under the ip_mutex lock */ @@ -344,6 +345,7 @@ ipflow_create( LIST_REMOVE(ipf, ipf_next); ipflow_addstats(ipf); rtfree(ipf->ipf_ro.ro_rt); + ipf->ipf_ro.ro_rt = NULL; ipf->ipf_uses = ipf->ipf_last_uses = 0; ipf->ipf_errors = ipf->ipf_dropped = 0; } diff --git a/bsd/netinet/ip_fw2.c b/bsd/netinet/ip_fw2.c index ea482f0c6..400e032b5 100644 --- a/bsd/netinet/ip_fw2.c +++ b/bsd/netinet/ip_fw2.c @@ -1309,8 +1309,10 @@ send_pkt(struct ipfw_flow_id *id, u_int32_t seq, u_int32_t ack, int flags) ip_rtaddr(ip->ip_dst, &sro); m->m_flags |= M_SKIP_FIREWALL; ip_output_list(m, 0, NULL, &sro, 0, NULL, NULL); - if (sro.ro_rt) + if (sro.ro_rt) { RTFREE(sro.ro_rt); + sro.ro_rt = NULL; + } } /* diff --git a/bsd/netinet/ip_fw2.h b/bsd/netinet/ip_fw2.h index 1e36b65a9..24ef2abe6 100644 --- a/bsd/netinet/ip_fw2.h +++ b/bsd/netinet/ip_fw2.h @@ -453,6 +453,7 @@ struct ip_fw_args { struct route *ro; /* for dummynet */ struct sockaddr_in *dst; /* for dummynet */ int flags; /* for dummynet */ + struct ip_out_args *ipoa; /* for dummynet */ struct ipfw_flow_id f_id; /* grabbed from IP header */ u_int16_t divert_rule; /* divert cookie */ diff --git a/bsd/netinet/ip_icmp.c b/bsd/netinet/ip_icmp.c index 3ed8a2d45..995ca8346 100644 --- a/bsd/netinet/ip_icmp.c +++ b/bsd/netinet/ip_icmp.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -641,10 +641,9 @@ reflect: } #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; - rtredirect((struct sockaddr *)&icmpsrc, - (struct sockaddr *)&icmpdst, - (struct sockaddr *)0, RTF_GATEWAY | RTF_HOST, - (struct sockaddr *)&icmpgw, (struct rtentry **)0); + rtredirect(m->m_pkthdr.rcvif, (struct sockaddr *)&icmpsrc, + (struct sockaddr *)&icmpdst, NULL, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&icmpgw, NULL); pfctlinput(PRC_REDIRECT_HOST, (struct sockaddr *)&icmpsrc); #if IPSEC key_sa_routechange((struct sockaddr *)&icmpsrc); @@ -826,6 +825,10 @@ icmp_send(struct mbuf *m, struct mbuf *opts) int hlen; struct icmp *icp; struct route ro; + struct ip_out_args ipoa = { IFSCOPE_NONE }; + + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + ipoa.ipoa_ifscope = m->m_pkthdr.rcvif->if_index; hlen = IP_VHL_HL(ip->ip_vhl) << 2; m->m_data += hlen; @@ -849,9 +852,11 @@ icmp_send(struct mbuf *m, struct mbuf *opts) } #endif bzero(&ro, sizeof ro); - (void) ip_output(m, opts, &ro, 0, NULL, NULL); - if (ro.ro_rt) + (void) ip_output(m, opts, &ro, IP_OUTARGS, NULL, &ipoa); + if (ro.ro_rt) { rtfree(ro.ro_rt); + ro.ro_rt = NULL; + } } n_time @@ -1075,6 +1080,10 @@ icmp_dgram_ctloutput(struct socket *so, struct sockopt *sopt) #endif case IP_STRIPHDR: case IP_RECVTTL: + case IP_BOUND_IF: +#if CONFIG_FORCE_OUT_IFP + case IP_FORCE_OUT_IFP: +#endif error = rip_ctloutput(so, sopt); break; diff --git a/bsd/netinet/ip_input.c b/bsd/netinet/ip_input.c index 8743d9178..7c603ad9f 100644 --- a/bsd/netinet/ip_input.c +++ b/bsd/netinet/ip_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -189,6 +189,14 @@ SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW, static int currentfrags = 0; +#if CONFIG_SCOPEDROUTING +int ip_doscopedroute = 1; +#else +int ip_doscopedroute = 0; +#endif +SYSCTL_INT(_net_inet_ip, OID_AUTO, scopedroute, CTLFLAG_RW, + &ip_doscopedroute, 0, "Enable IPv4 scoped routing"); + /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table @@ -586,12 +594,14 @@ ip_input(struct mbuf *m) panic("ip_input no HDR"); #endif +#if DUMMYNET if (args.rule) { /* dummynet already filtered us */ ip = mtod(m, struct ip *); hlen = IP_VHL_HL(ip->ip_vhl) << 2; inject_filter_ref = ipf_get_inject_filter(m); goto iphack ; } +#endif /* DUMMYNET */ #endif /* IPFIREWALL */ /* @@ -2080,13 +2090,10 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop, struct route n_long dest; struct in_addr pkt_dst; struct ifnet *destifp; - struct ifnet *rcvif = m->m_pkthdr.rcvif; #if IPSEC struct ifnet dummyifp; #endif - m->m_pkthdr.rcvif = NULL; - dest = 0; /* * Cache the destination address of the packet; this may be diff --git a/bsd/netinet/ip_output.c b/bsd/netinet/ip_output.c index db39fe174..047b6b7ce 100644 --- a/bsd/netinet/ip_output.c +++ b/bsd/netinet/ip_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -81,6 +81,7 @@ #include #include +#include #include #include @@ -145,6 +146,8 @@ static int ip_pcbopts(int, struct mbuf **, struct mbuf *); static int ip_setmoptions(struct sockopt *, struct ip_moptions **); static void ip_out_cksum_stats(int, u_int32_t); +static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int); +static void ip_bindif(struct inpcb *, unsigned int); int ip_createmoptions(struct ip_moptions **imop); int ip_addmembership(struct ip_moptions *imo, struct ip_mreq *mreq); @@ -175,6 +178,11 @@ static int forge_ce = 0; SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce, CTLFLAG_RW, &forge_ce, 0, "Forge ECN CE"); #endif /* DEBUG */ + +static int ip_select_srcif_debug = 0; +SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug, CTLFLAG_RW, + &ip_select_srcif_debug, 0, "log source interface selection debug info"); + /* * IP output. The packet in mbuf chain m contains a skeletal IP * header (with len, off, ttl, proto, tos, src, dst). 
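The new IP_OUTARGS flag together with struct ip_out_args is how callers hand an interface scope down to ip_output(). The sketch below mirrors the pattern this patch itself applies in icmp_send() and in_gif_output(), scoping the transmit to the interface the packet arrived on when one is known; the function name is illustrative and the names it uses come from the headers modified by this change.

/* illustrative caller: scope the transmit to the receive interface */
static int
send_scoped(struct mbuf *m)
{
        struct route ro;
        struct ip_out_args ipoa = { IFSCOPE_NONE };
        int error;

        if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL)
                ipoa.ipoa_ifscope = m->m_pkthdr.rcvif->if_index;

        bzero(&ro, sizeof (ro));
        error = ip_output(m, NULL, &ro, IP_OUTARGS, NULL, &ipoa);
        if (ro.ro_rt != NULL) {
                rtfree(ro.ro_rt);
                ro.ro_rt = NULL;
        }
        return (error);
}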
@@ -188,10 +196,10 @@ ip_output( struct route *ro, int flags, struct ip_moptions *imo, - struct ifnet *ifp) + struct ip_out_args *ipoa) { int error; - error = ip_output_list(m0, 0, opt, ro, flags, imo, ifp); + error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa); return error; } @@ -225,11 +233,7 @@ ip_output_list( struct route *ro, int flags, struct ip_moptions *imo, -#if CONFIG_FORCE_OUT_IFP - struct ifnet *pdp_ifp -#else - __unused struct ifnet *unused_ifp -#endif + struct ip_out_args *ipoa ) { struct ip *ip, *mhip; @@ -256,9 +260,11 @@ ip_output_list( ipfilter_t inject_filter_ref = 0; struct m_tag *tag; struct route saved_route; + struct ip_out_args saved_ipoa; struct mbuf * packetlist; int pktcnt = 0; - + unsigned int ifscope; + boolean_t select_srcif; KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); @@ -268,6 +274,7 @@ ip_output_list( args.eh = NULL; args.rule = NULL; args.divert_rule = 0; /* divert cookie */ + args.ipoa = NULL; /* Grab info from mtags prepended to the chain */ #if DUMMYNET @@ -284,6 +291,8 @@ ip_output_list( dst = dn_tag->dn_dst; ifp = dn_tag->ifp; flags = dn_tag->flags; + saved_ipoa = dn_tag->ipoa; + ipoa = &saved_ipoa; m_tag_delete(m0, tag); } @@ -320,6 +329,20 @@ ip_output_list( mtod(m, struct ip *)->ip_p); #endif + /* + * Do not perform source interface selection when forwarding. + * At present the IP_OUTARGS flag implies a request for IP to + * perform source interface selection. + */ + if (ip_doscopedroute && + (flags & (IP_OUTARGS | IP_FORWARDING)) == IP_OUTARGS) { + select_srcif = TRUE; + ifscope = ipoa->ipoa_ifscope; + } else { + select_srcif = FALSE; + ifscope = IFSCOPE_NONE; + } + #if IPFIREWALL if (args.rule != NULL) { /* dummynet already saw us */ ip = mtod(m, struct ip *); @@ -419,7 +442,13 @@ loopit: rtfree_locked(ro->ro_rt); ro->ro_rt = NULL; } - if (ro->ro_rt && ro->ro_rt->generation_id != route_generation) + /* + * If we're doing source interface selection, we may not + * want to use this route; only synch up the generation + * count otherwise. + */ + if (!select_srcif && ro->ro_rt != NULL && + ro->ro_rt->generation_id != route_generation) ro->ro_rt->generation_id = route_generation; } if (ro->ro_rt == NULL) { @@ -448,22 +477,81 @@ loopit: ifp = ia->ia_ifp; ip->ip_ttl = 1; isbroadcast = in_broadcast(dst->sin_addr, ifp); + } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && + imo != NULL && imo->imo_multicast_ifp != NULL) { + /* + * Bypass the normal routing lookup for multicast + * packets if the interface is specified. + */ + ifp = imo->imo_multicast_ifp; + isbroadcast = 0; + if (ia != NULL) + ifafree(&ia->ia_ifa); + + /* Could use IFP_TO_IA instead but rt_mtx is already held */ + for (ia = TAILQ_FIRST(&in_ifaddrhead); + ia != NULL && ia->ia_ifp != ifp; + ia = TAILQ_NEXT(ia, ia_link)) + continue; + + if (ia != NULL) + ifaref(&ia->ia_ifa); } else { + boolean_t cloneok = FALSE; + /* + * Perform source interface selection; the source IP address + * must belong to one of the addresses of the interface used + * by the route. For performance reasons, do this only if + * there is no route, or if the routing table has changed, + * or if we haven't done source interface selection on this + * route (for this PCB instance) before. 
+ */ + if (select_srcif && ip->ip_src.s_addr != INADDR_ANY && + (ro->ro_rt == NULL || + ro->ro_rt->generation_id != route_generation || + !(ro->ro_flags & ROF_SRCIF_SELECTED))) { + struct ifaddr *ifa; -#if CONFIG_FORCE_OUT_IFP - /* Check if this packet should be forced out a specific interface */ - if (ro->ro_rt == 0 && pdp_ifp != NULL) { - pdp_context_route_locked(pdp_ifp, ro); - - if (ro->ro_rt == NULL) { - OSAddAtomic(1, (UInt32*)&ipstat.ips_noroute); - error = EHOSTUNREACH; + /* Find the source interface */ + ifa = in_selectsrcif(ip, ro, ifscope); + + /* + * If the source address is spoofed (in the case + * of IP_RAWOUTPUT), or if this is destined for + * local/loopback, just let it go out using the + * interface of the route. Otherwise, there's no + * interface having such an address, so bail out. + */ + if (ifa == NULL && !(flags & IP_RAWOUTPUT) && + ifscope != lo_ifp->if_index) { + error = EADDRNOTAVAIL; lck_mtx_unlock(rt_mtx); goto bad; } + + /* + * If the caller didn't explicitly specify the scope, + * pick it up from the source interface. If the cached + * route was wrong and was blown away as part of source + * interface selection, don't mask out RTF_PRCLONING + * since that route may have been allocated by the ULP, + * unless the IP header was created by the caller or + * the destination is IPv4 LLA. The check for the + * latter is needed because IPv4 LLAs are never scoped + * in the current implementation, and we don't want to + * replace the resolved IPv4 LLA route with one whose + * gateway points to that of the default gateway on + * the primary interface of the system. + */ + if (ifa != NULL) { + if (ifscope == IFSCOPE_NONE) + ifscope = ifa->ifa_ifp->if_index; + ifafree(ifa); + cloneok = (!(flags & IP_RAWOUTPUT) && + !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)))); + } } -#endif - + /* * If this is the case, we probably don't want to allocate * a protocol-cloned route since we didn't get one from the @@ -473,8 +561,7 @@ loopit: * the link layer, as this is probably required in all cases * for correct operation (as it is for ARP). */ - - if (ro->ro_rt == 0) { + if (ro->ro_rt == NULL) { unsigned long ign = RTF_PRCLONING; /* * We make an exception here: if the destination @@ -487,23 +574,26 @@ loopit: * that allocate a route and those that don't. The * RTF_BROADCAST route is important since we'd want * to send out undirected IP broadcast packets using - * link-level broadcast address. + * link-level broadcast address. Another exception + * is for ULP-created routes that got blown away by + * source interface selection (see above). * - * This exception will no longer be necessary when + * These exceptions will no longer be necessary when * the RTF_PRCLONING scheme is no longer present. */ - if (dst->sin_addr.s_addr == INADDR_BROADCAST) + if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) ign &= ~RTF_PRCLONING; - rtalloc_ign_locked(ro, ign); + rtalloc_scoped_ign_locked(ro, ign, ifscope); } - if (ro->ro_rt == 0) { + + if (ro->ro_rt == NULL) { OSAddAtomic(1, (SInt32*)&ipstat.ips_noroute); error = EHOSTUNREACH; lck_mtx_unlock(rt_mtx); goto bad; } - + if (ia) ifafree(&ia->ia_ifa); ia = ifatoia(ro->ro_rt->rt_ifa); @@ -1025,22 +1115,24 @@ skip_ipsec: } #if DUMMYNET if (DUMMYNET_LOADED && (off & IP_FW_PORT_DYNT_FLAG) != 0) { - /* - * pass the pkt to dummynet. Need to include - * pipe number, m, ifp, ro, dst because these are - * not recomputed in the next pass. - * All other parameters have been already used and - * so they are not needed anymore. 
- * XXX note: if the ifp or ro entry are deleted - * while a pkt is in dummynet, we are in trouble! - */ - args.ro = ro; - args.dst = dst; - args.flags = flags; - - error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, - &args); - goto done; + /* + * pass the pkt to dummynet. Need to include + * pipe number, m, ifp, ro, dst because these are + * not recomputed in the next pass. + * All other parameters have been already used and + * so they are not needed anymore. + * XXX note: if the ifp or ro entry are deleted + * while a pkt is in dummynet, we are in trouble! + */ + args.ro = ro; + args.dst = dst; + args.flags = flags; + if (flags & IP_OUTARGS) + args.ipoa = ipoa; + + error = ip_dn_io_ptr(m, off & 0xffff, DN_TO_IP_OUT, + &args); + goto done; } #endif /* DUMMYNET */ #if IPDIVERT @@ -1941,58 +2033,66 @@ ip_ctloutput(so, sopt) break; #undef OPTSET -#if CONFIG_FORCE_OUT_IFP +#if CONFIG_FORCE_OUT_IFP + /* + * Apple private interface, similar to IP_BOUND_IF, except + * that the parameter is a NULL-terminated string containing + * the name of the network interface; an emptry string means + * unbind. Applications are encouraged to use IP_BOUND_IF + * instead, as that is the current "official" API. + */ case IP_FORCE_OUT_IFP: { - char ifname[IFNAMSIZ]; - ifnet_t ifp; - + char ifname[IFNAMSIZ]; + unsigned int ifscope; + + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + /* Verify interface name parameter is sane */ if (sopt->sopt_valsize > sizeof(ifname)) { error = EINVAL; break; } - + /* Copy the interface name */ if (sopt->sopt_valsize != 0) { - error = sooptcopyin(sopt, ifname, sizeof(ifname), sopt->sopt_valsize); + error = sooptcopyin(sopt, ifname, + sizeof (ifname), sopt->sopt_valsize); if (error) break; } - - if (sopt->sopt_valsize == 0 || ifname[0] == 0) { - // Set pdp_ifp to NULL - inp->pdp_ifp = NULL; - - // Flush the route - if (inp->inp_route.ro_rt) { - rtfree(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = NULL; + + if (sopt->sopt_valsize == 0 || ifname[0] == NULL) { + /* Unbind this socket from any interface */ + ifscope = IFSCOPE_NONE; + } else { + ifnet_t ifp; + + /* Verify name is NULL terminated */ + if (ifname[sopt->sopt_valsize - 1] != NULL) { + error = EINVAL; + break; } - - break; - } - - /* Verify name is NULL terminated */ - if (ifname[sopt->sopt_valsize - 1] != 0) { - error = EINVAL; - break; - } - - if (ifnet_find_by_name(ifname, &ifp) != 0) { - error = ENXIO; - break; - } - - /* Won't actually free. Since we don't release this later, we should do it now. */ - ifnet_release(ifp); - - /* This only works for point-to-point interfaces */ - if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { - error = ENOTSUP; - break; + + /* Bail out if given bogus interface name */ + if (ifnet_find_by_name(ifname, &ifp) != 0) { + error = ENXIO; + break; + } + + /* Bind this socket to this interface */ + ifscope = ifp->if_index; + + /* + * Won't actually free; since we don't release + * this later, we should do it now. + */ + ifnet_release(ifp); } - - inp->pdp_ifp = ifp; + ip_bindif(inp, ifscope); } break; #endif @@ -2080,6 +2180,40 @@ ip_ctloutput(so, sopt) } #endif /* TRAFFIC_MGT */ + /* + * On a multihomed system, scoped routing can be used to + * restrict the source interface used for sending packets. + * The socket option IP_BOUND_IF binds a particular AF_INET + * socket to an interface such that data sent on the socket + * is restricted to that interface. 
This is unlike the + * SO_DONTROUTE option where the routing table is bypassed; + * therefore it allows for a greater flexibility and control + * over the system behavior, and does not place any restriction + * on the destination address type (e.g. unicast, multicast, + * or broadcast if applicable) or whether or not the host is + * directly reachable. Note that in the multicast transmit + * case, IP_MULTICAST_IF takes precedence over IP_BOUND_IF, + * since the former practically bypasses the routing table; + * in this case, IP_BOUND_IF sets the default interface used + * for sending multicast packets in the absence of an explicit + * transmit interface set via IP_MULTICAST_IF. + */ + case IP_BOUND_IF: + /* This option is settable only for IPv4 */ + if (!(inp->inp_vflag & INP_IPV4)) { + error = EINVAL; + break; + } + + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + + if (error) + break; + + ip_bindif(inp, optval); + break; + default: error = ENOPROTOOPT; break; @@ -2198,6 +2332,12 @@ ip_ctloutput(so, sopt) } #endif /* TRAFFIC_MGT */ + case IP_BOUND_IF: + if (inp->inp_flags & INP_BOUND_IF) + optval = inp->inp_boundif; + error = sooptcopyout(sopt, &optval, sizeof (optval)); + break; + default: error = ENOPROTOOPT; break; @@ -2870,3 +3010,189 @@ ip_mloopback(ifp, m, dst, hlen) m_freem(copym); } } + +/* + * Given a source IP address (and route, if available), determine the best + * interface to send the packet from. + */ +static struct ifaddr * +in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope) +{ + struct ifaddr *ifa = NULL; + struct sockaddr src = { sizeof (struct sockaddr_in), AF_INET, { 0, } }; + struct ifnet *rt_ifp; + char ip_src[16], ip_dst[16]; + + if (ip_select_srcif_debug) { + (void) inet_ntop(AF_INET, &ip->ip_src.s_addr, ip_src, + sizeof (ip_src)); + (void) inet_ntop(AF_INET, &ip->ip_dst.s_addr, ip_dst, + sizeof (ip_dst)); + } + + lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); + + ((struct sockaddr_in *)&src)->sin_addr.s_addr = ip->ip_src.s_addr; + rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL; + + /* + * Given the source IP address, find a suitable source interface + * to use for transmission; if the caller has specified a scope, + * optimize the search by looking at the addresses only for that + * interface. This is still suboptimal, however, as we need to + * traverse the per-interface list. + */ + if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) { + unsigned int scope = ifscope; + + /* + * If no scope is specified and the route is stale (pointing + * to a defunct interface) use the current primary interface; + * this happens when switching between interfaces configured + * with the same IP address. Otherwise pick up the scope + * information from the route; the ULP may have looked up a + * correct route and we just need to verify it here and mark + * it with the ROF_SRCIF_SELECTED flag below. 
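From user space, the IP_BOUND_IF behavior handled in ip_ctloutput() above is driven with an ordinary setsockopt()/getsockopt() pair at the IPPROTO_IP level, passing the interface index as an int; a value of 0 (IFSCOPE_NONE) unbinds the socket. A minimal sketch, assuming the interface is named "en0":

#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>      /* if_nametoindex() */
#include <stdio.h>

static void
bind_socket_to_en0(int s)
{
        int idx = (int)if_nametoindex("en0");   /* "en0" is a placeholder */
        socklen_t len = sizeof (idx);

        /* restrict traffic on s to en0; passing 0 would unbind it again */
        if (setsockopt(s, IPPROTO_IP, IP_BOUND_IF, &idx, sizeof (idx)) == -1)
                perror("setsockopt(IP_BOUND_IF)");

        /* read the current binding back */
        (void) getsockopt(s, IPPROTO_IP, IP_BOUND_IF, &idx, &len);
}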
+ */ + if (scope == IFSCOPE_NONE) { + scope = rt_ifp->if_index; + if (scope != get_primary_ifscope() && + ro->ro_rt->generation_id != route_generation) + scope = get_primary_ifscope(); + } + + ifa = ifa_ifwithaddr_scoped(&src, scope); + + if (ip_select_srcif_debug && ifa != NULL) { + if (ro->ro_rt != NULL) { + printf("%s->%s ifscope %d->%d ifa_if %s%d " + "ro_if %s%d\n", ip_src, ip_dst, ifscope, + scope, ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit, rt_ifp->if_name, + rt_ifp->if_unit); + } else { + printf("%s->%s ifscope %d->%d ifa_if %s%d\n", + ip_src, ip_dst, ifscope, scope, + ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit); + } + } + } + + /* + * Slow path; search for an interface having the corresponding source + * IP address if the scope was not specified by the caller, and: + * + * 1) There currently isn't any route, or, + * 2) The interface used by the route does not own that source + * IP address; in this case, the route will get blown away + * and we'll do a more specific scoped search using the newly + * found interface. + */ + if (ifa == NULL && ifscope == IFSCOPE_NONE) { + ifa = ifa_ifwithaddr(&src); + + if (ip_select_srcif_debug && ifa != NULL) { + printf("%s->%s ifscope %d ifa_if %s%d\n", + ip_src, ip_dst, ifscope, ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit); + } + } + + /* + * If there is a non-loopback route with the wrong interface, or if + * there is no interface configured with such an address, blow it + * away. Except for local/loopback, we look for one with a matching + * interface scope/index. + */ + if (ro->ro_rt != NULL && + (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) || + !(ro->ro_rt->rt_flags & RTF_UP))) { + if (ip_select_srcif_debug) { + if (ifa != NULL) { + printf("%s->%s ifscope %d ro_if %s%d != " + "ifa_if %s%d (cached route cleared)\n", + ip_src, ip_dst, ifscope, rt_ifp->if_name, + rt_ifp->if_unit, ifa->ifa_ifp->if_name, + ifa->ifa_ifp->if_unit); + } else { + printf("%s->%s ifscope %d ro_if %s%d " + "(no ifa_if found)\n", + ip_src, ip_dst, ifscope, rt_ifp->if_name, + rt_ifp->if_unit); + } + } + + rtfree_locked(ro->ro_rt); + ro->ro_rt = NULL; + ro->ro_flags &= ~ROF_SRCIF_SELECTED; + + /* + * If the destination is IPv4 LLA and the route's interface + * doesn't match the source interface, then the source IP + * address is wrong; it most likely belongs to the primary + * interface associated with the IPv4 LL subnet. Drop the + * packet rather than letting it go out and return an error + * to the ULP. This actually applies not only to IPv4 LL + * but other shared subnets; for now we explicitly test only + * for the former case and save the latter for future. + */ + if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) && + !IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) && ifa != NULL) { + ifafree(ifa); + ifa = NULL; + } + } + + if (ip_select_srcif_debug && ifa == NULL) { + printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n", + ip_src, ip_dst, ifscope); + } + + /* + * If there is a route, mark it accordingly. If there isn't one, + * we'll get here again during the next transmit (possibly with a + * route) and the flag will get set at that point. For IPv4 LLA + * destination, mark it only if the route has been fully resolved; + * otherwise we want to come back here again when the route points + * to the interface over which the ARP reply arrives on. 
+ */ + if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) || + (ro->ro_rt->rt_gateway->sa_family == AF_LINK && + SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) { + ro->ro_flags |= ROF_SRCIF_SELECTED; + ro->ro_rt->generation_id = route_generation; + } + + return (ifa); +} + +/* + * Handler for setting IP_FORCE_OUT_IFP or IP_BOUND_IF socket option. + */ +static void +ip_bindif(struct inpcb *inp, unsigned int ifscope) +{ + /* + * A zero interface scope value indicates an "unbind". + * Otherwise, take in whatever value the app desires; + * the app may already know the scope (or force itself + * to such a scope) ahead of time before the interface + * gets attached. It doesn't matter either way; any + * route lookup from this point on will require an + * exact match for the embedded interface scope. + */ + inp->inp_boundif = ifscope; + if (inp->inp_boundif == IFSCOPE_NONE) + inp->inp_flags &= ~INP_BOUND_IF; + else + inp->inp_flags |= INP_BOUND_IF; + + lck_mtx_lock(rt_mtx); + /* Blow away any cached route in the PCB */ + if (inp->inp_route.ro_rt != NULL) { + rtfree_locked(inp->inp_route.ro_rt); + inp->inp_route.ro_rt = NULL; + } + lck_mtx_unlock(rt_mtx); +} diff --git a/bsd/netinet/ip_var.h b/bsd/netinet/ip_var.h index 81eb6f135..0861cf587 100644 --- a/bsd/netinet/ip_var.h +++ b/bsd/netinet/ip_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -195,12 +195,20 @@ struct ip_linklocal_stat { #define IP_NOIPSEC 0x4 /* No IPSec processing */ #define IP_ROUTETOIF SO_DONTROUTE /* bypass routing tables (0x0010) */ #define IP_ALLOWBROADCAST SO_BROADCAST /* can send broadcast packets (0x0020) */ +#define IP_OUTARGS 0x100 /* has ancillary output info */ struct ip; struct inpcb; struct route; struct sockopt; +/* + * Extra information passed to ip_output when IP_OUTARGS is set. 
+ */ +struct ip_out_args { + unsigned int ipoa_ifscope; /* interface scope */ +}; + extern struct ipstat ipstat; #if !defined(RANDOM_IP_ID) || RANDOM_IP_ID == 0 extern u_short ip_id; /* ip packet ctr, for ids */ @@ -214,6 +222,7 @@ extern int (*legal_vif_num)(int); extern u_long (*ip_mcast_src)(int); extern int rsvp_on; extern struct pr_usrreqs rip_usrreqs; +extern int ip_doscopedroute; int ip_ctloutput(struct socket *, struct sockopt *sopt); void ip_drain(void); @@ -221,10 +230,10 @@ void ip_freemoptions(struct ip_moptions *); void ip_init(void) __attribute__((section("__TEXT, initcode"))); extern int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *, struct ip_moptions *); -int ip_output(struct mbuf *, - struct mbuf *, struct route *, int, struct ip_moptions *, struct ifnet *); -int ip_output_list(struct mbuf *, int, - struct mbuf *, struct route *, int, struct ip_moptions *, struct ifnet *); +extern int ip_output(struct mbuf *, struct mbuf *, struct route *, int, + struct ip_moptions *, struct ip_out_args *); +extern int ip_output_list(struct mbuf *, int, struct mbuf *, struct route *, + int, struct ip_moptions *, struct ip_out_args *); struct in_ifaddr * ip_rtaddr(struct in_addr, struct route *); void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, diff --git a/bsd/netinet/kpi_ipfilter.c b/bsd/netinet/kpi_ipfilter.c index 3ade07635..11e005de8 100644 --- a/bsd/netinet/kpi_ipfilter.c +++ b/bsd/netinet/kpi_ipfilter.c @@ -316,8 +316,10 @@ ipf_injectv4_out( error = ip_output(m, NULL, &ro, IP_ALLOWBROADCAST | IP_RAWOUTPUT, imo, NULL); /* Release the route */ - if (ro.ro_rt) + if (ro.ro_rt) { rtfree(ro.ro_rt); + ro.ro_rt = NULL; + } return error; } @@ -390,8 +392,10 @@ ipf_injectv6_out( error = ip6_output(m, NULL, &ro, 0, im6o, NULL, 0); /* Release the route */ - if (ro.ro_rt) + if (ro.ro_rt) { rtfree(ro.ro_rt); + ro.ro_rt = NULL; + } return error; } diff --git a/bsd/netinet/raw_ip.c b/bsd/netinet/raw_ip.c index 533184f4e..0cc25b616 100644 --- a/bsd/netinet/raw_ip.c +++ b/bsd/netinet/raw_ip.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -320,6 +320,12 @@ rip_output(m, so, dst) register struct ip *ip; register struct inpcb *inp = sotoinpcb(so); int flags = (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST; + struct ip_out_args ipoa; + + /* If socket was bound to an ifindex, tell ip_output about it */ + ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; + flags |= IP_OUTARGS; /* * If the user handed us a complete IP packet, use it. @@ -384,14 +390,8 @@ rip_output(m, so, dst) #if CONFIG_IP_EDGEHOLE ip_edgehole_mbuf_tag(inp, m); #endif - -#if CONFIG_FORCE_OUT_IFP - return (ip_output_list(m, 0, inp->inp_options, &inp->inp_route, flags, - inp->inp_moptions, inp->pdp_ifp)); -#else - return (ip_output_list(m, 0, inp->inp_options, &inp->inp_route, flags, - inp->inp_moptions, NULL)); -#endif + return (ip_output(m, inp->inp_options, &inp->inp_route, flags, + inp->inp_moptions, &ipoa)); } #if IPFIREWALL diff --git a/bsd/netinet/tcp_input.c b/bsd/netinet/tcp_input.c index 36756785d..138bcb3c7 100644 --- a/bsd/netinet/tcp_input.c +++ b/bsd/netinet/tcp_input.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -210,7 +210,7 @@ struct inpcbhead tcb; struct inpcbinfo tcbinfo; static void tcp_dooptions(struct tcpcb *, - u_char *, int, struct tcphdr *, struct tcpopt *); + u_char *, int, struct tcphdr *, struct tcpopt *, unsigned int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, @@ -552,6 +552,19 @@ tcp_input(m, off0) #endif struct m_tag *fwd_tag; u_char ip_ecn = IPTOS_ECN_NOTECT; + unsigned int ifscope; + + /* + * Record the interface where this segment arrived on; this does not + * affect normal data output (for non-detached TCP) as it provides a + * hint about which route and interface to use for sending in the + * absence of a PCB, when scoped routing (and thus source interface + * selection) are enabled. + */ + if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.rcvif != NULL) + ifscope = m->m_pkthdr.rcvif->if_index; + else + ifscope = IFSCOPE_NONE; /* Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ fwd_tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_IPFORWARD, NULL); @@ -821,6 +834,14 @@ findpcb: ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); } + /* + * Use the interface scope information from the PCB for outbound + * segments. If the PCB isn't present and if scoped routing is + * enabled, tcp_respond will use the scope of the interface where + * the segment arrived on. + */ + if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) + ifscope = inp->inp_boundif; #if IPSEC if (ipsec_bypass == 0) { #if INET6 @@ -981,6 +1002,11 @@ findpcb: struct inpcb *oinp = sotoinpcb(so); #endif /* INET6 */ int ogencnt = so->so_gencnt; + unsigned int head_ifscope; + + /* Get listener's bound-to-interface, if any */ + head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; #if !IPSEC /* @@ -1107,6 +1133,21 @@ findpcb: */ dropsocket++; inp = (struct inpcb *)so->so_pcb; + + /* + * Inherit INP_BOUND_IF from listener; testing if + * head_ifscope is non-zero is sufficient, since it + * can only be set to a non-zero value earlier if + * the listener has such a flag set. + */ +#if INET6 + if (head_ifscope != IFSCOPE_NONE && !isipv6) { +#else + if (head_ifscope != IFSCOPE_NONE) { +#endif /* INET6 */ + inp->inp_flags |= INP_BOUND_IF; + inp->inp_boundif = head_ifscope; + } #if INET6 if (isipv6) inp->in6p_laddr = ip6->ip6_dst; @@ -1344,7 +1385,7 @@ findpcb: * else do it below (after getting remote address). 
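Since accepted sockets now inherit INP_BOUND_IF/inp_boundif from the listening PCB, binding a listener once is enough to scope every connection it accepts. A hedged user-space sketch (the interface name and helper name are assumptions, and error handling is omitted):

static int
accept_on_en0(struct sockaddr_in *laddr)
{
        int ls = socket(AF_INET, SOCK_STREAM, 0);
        int idx = (int)if_nametoindex("en0");          /* assumed name */

        setsockopt(ls, IPPROTO_IP, IP_BOUND_IF, &idx, sizeof (idx));
        bind(ls, (struct sockaddr *)laddr, sizeof (*laddr));
        listen(ls, 5);
        /* the accepted socket inherits the en0 binding from the listener */
        return (accept(ls, NULL, NULL));
}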
*/ if (tp->t_state != TCPS_LISTEN && optp) - tcp_dooptions(tp, optp, optlen, th, &to); + tcp_dooptions(tp, optp, optlen, th, &to, ifscope); if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if (to.to_flags & TOF_SCALE) { @@ -1359,7 +1400,7 @@ findpcb: tp->ts_recent_age = tcp_now; } if (to.to_flags & TOF_MSS) - tcp_mss(tp, to.to_mss); + tcp_mss(tp, to.to_mss, ifscope); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) tp->sack_enable = 0; @@ -1406,6 +1447,11 @@ findpcb: tp->ts_recent = to.to_tsval; } + /* Force acknowledgment if we received a FIN */ + + if (thflags & TH_FIN) + tp->t_flags |= TF_ACKNOW; + if (tlen == 0) { if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && @@ -1700,7 +1746,7 @@ findpcb: FREE(sin, M_SONAME); } - tcp_dooptions(tp, optp, optlen, th, &to); + tcp_dooptions(tp, optp, optlen, th, &to, ifscope); if (tp->sack_enable) { if (!(to.to_flags & TOF_SACK)) @@ -2667,8 +2713,9 @@ process_ACK: soisdisconnected(so); } tp->t_state = TCPS_FIN_WAIT_2; - goto drop; + /* fall through and make sure we also recognize data ACKed with the FIN */ } + tp->t_flags |= TF_ACKNOW; break; /* @@ -2691,6 +2738,7 @@ process_ACK: add_to_time_wait(tp); soisdisconnected(so); } + tp->t_flags |= TF_ACKNOW; break; /* @@ -2811,7 +2859,7 @@ dodata: /* XXX */ * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ - if ((tlen || (thflags&TH_FIN)) && + if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; tcp_seq save_end = th->th_seq + tlen; @@ -3056,13 +3104,13 @@ dropwithreset: if (thflags & TH_ACK) /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, - TH_RST, m->m_pkthdr.rcvif); + TH_RST, ifscope); else { if (thflags & TH_SYN) tlen++; /* mtod() below is safe as long as hdr dropping is delayed */ tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, - (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.rcvif); + (tcp_seq)0, TH_RST|TH_ACK, ifscope); } /* destroy temporarily created socket */ if (dropsocket) { @@ -3099,7 +3147,7 @@ drop: } static void -tcp_dooptions(tp, cp, cnt, th, to) +tcp_dooptions(tp, cp, cnt, th, to, input_ifscope) /* * Parse TCP options and place in tcpopt. 
*/ @@ -3108,6 +3156,7 @@ tcp_dooptions(tp, cp, cnt, th, to) int cnt; struct tcphdr *th; struct tcpopt *to; + unsigned int input_ifscope; { u_short mss = 0; int opt, optlen; @@ -3187,7 +3236,7 @@ tcp_dooptions(tp, cp, cnt, th, to) } } if (th->th_flags & TH_SYN) - tcp_mss(tp, mss); /* sets t_maxseg */ + tcp_mss(tp, mss, input_ifscope); /* sets t_maxseg */ } /* @@ -3361,9 +3410,10 @@ tcp_maxmtu6(struct rtentry *rt) * */ void -tcp_mss(tp, offer) +tcp_mss(tp, offer, input_ifscope) struct tcpcb *tp; int offer; + unsigned int input_ifscope; { register struct rtentry *rt; struct ifnet *ifp; @@ -3398,7 +3448,7 @@ tcp_mss(tp, offer) else #endif /* INET6 */ { - rt = tcp_rtlookup(inp); + rt = tcp_rtlookup(inp, input_ifscope); if (rt && (rt->rt_gateway->sa_family == AF_LINK || rt->rt_ifp->if_flags & IFF_LOOPBACK)) isnetlocal = TRUE; @@ -3620,7 +3670,7 @@ tcp_mssopt(tp) rt = tcp_rtlookup6(tp->t_inpcb); else #endif /* INET6 */ - rt = tcp_rtlookup(tp->t_inpcb); + rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE); if (rt == NULL) { lck_mtx_unlock(rt_mtx); return ( diff --git a/bsd/netinet/tcp_output.c b/bsd/netinet/tcp_output.c index e22e04993..af6873478 100644 --- a/bsd/netinet/tcp_output.c +++ b/bsd/netinet/tcp_output.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1638,6 +1638,13 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, int error = 0; boolean_t chain; boolean_t unlocked = FALSE; + struct inpcb *inp = tp->t_inpcb; + struct ip_out_args ipoa; + + /* If socket was bound to an ifindex, tell ip_output about it */ + ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; + flags |= IP_OUTARGS; /* Make sure ACK/DELACK conditions are cleared before * we unlock the socket. @@ -1691,13 +1698,8 @@ tcp_ip_output(struct socket *so, struct tcpcb *tp, struct mbuf *pkt, */ cnt = 0; } -#if CONFIG_FORCE_OUT_IFP - error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route, - flags, 0, tp->t_inpcb->pdp_ifp); -#else - error = ip_output_list(pkt, cnt, opt, &tp->t_inpcb->inp_route, - flags, 0, NULL); -#endif + error = ip_output_list(pkt, cnt, opt, &inp->inp_route, + flags, 0, &ipoa); if (chain || error) { /* * If we sent down a chain then we are done since diff --git a/bsd/netinet/tcp_subr.c b/bsd/netinet/tcp_subr.c index a94f8ad2a..f0d78d7b8 100644 --- a/bsd/netinet/tcp_subr.c +++ b/bsd/netinet/tcp_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -477,11 +477,7 @@ tcp_respond( tcp_seq ack, tcp_seq seq, int flags, -#if CONFIG_FORCE_OUT_IFP - ifnet_t ifp -#else - __unused ifnet_t ifp -#endif + unsigned int ifscope ) { register int tlen; @@ -496,7 +492,6 @@ tcp_respond( struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ - int ipflags = 0; #if INET6 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6; @@ -654,7 +649,7 @@ tcp_respond( #endif #if INET6 if (isipv6) { - (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL, 0); + (void)ip6_output(m, NULL, ro6, 0, NULL, NULL, 0); if (ro6 == &sro6 && ro6->ro_rt) { rtfree(ro6->ro_rt); ro6->ro_rt = NULL; @@ -662,11 +657,10 @@ tcp_respond( } else #endif /* INET6 */ { -#if CONFIG_FORCE_OUT_IFP - ifp = (tp && tp->t_inpcb) ? tp->t_inpcb->pdp_ifp : - (ifp && (ifp->if_flags & IFF_POINTOPOINT) != 0) ? 
ifp : NULL; -#endif - (void) ip_output_list(m, 0, NULL, ro, ipflags, NULL, ifp); + struct ip_out_args ipoa = { ifscope }; + + (void) ip_output(m, NULL, ro, IP_OUTARGS, NULL, &ipoa); + if (ro == &sro && ro->ro_rt) { rtfree(ro->ro_rt); ro->ro_rt = NULL; @@ -1561,7 +1555,7 @@ tcp_mtudisc( rt = tcp_rtlookup6(inp); else #endif /* INET6 */ - rt = tcp_rtlookup(inp); + rt = tcp_rtlookup(inp, IFSCOPE_NONE); if (!rt || !rt->rt_rmx.rmx_mtu) { tp->t_maxopd = tp->t_maxseg = #if INET6 @@ -1631,8 +1625,9 @@ tcp_mtudisc( * to get the interface MTU. */ struct rtentry * -tcp_rtlookup(inp) +tcp_rtlookup(inp, input_ifscope) struct inpcb *inp; + unsigned int input_ifscope; { struct route *ro; struct rtentry *rt; @@ -1648,11 +1643,24 @@ tcp_rtlookup(inp) if (rt == NULL || !(rt->rt_flags & RTF_UP) || rt->generation_id != route_generation) { /* No route yet, so try to acquire one */ if (inp->inp_faddr.s_addr != INADDR_ANY) { + unsigned int ifscope; + ro->ro_dst.sa_family = AF_INET; ro->ro_dst.sa_len = sizeof(struct sockaddr_in); ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = inp->inp_faddr; - rtalloc_ign_locked(ro, 0UL); + + /* + * If the socket was bound to an interface, then + * the bound-to-interface takes precedence over + * the inbound interface passed in by the caller + * (if we get here as part of the output path then + * input_ifscope is IFSCOPE_NONE). + */ + ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : input_ifscope; + + rtalloc_scoped_ign_locked(ro, 0UL, ifscope); rt = ro->ro_rt; } } @@ -1807,7 +1815,7 @@ tcp_gettaocache(inp) rt = tcp_rtlookup6(inp); else #endif /* INET6 */ - rt = tcp_rtlookup(inp); + rt = tcp_rtlookup(inp, IFSCOPE_NONE); /* Make sure this is a host route and is up. */ if (rt == NULL || diff --git a/bsd/netinet/tcp_timer.c b/bsd/netinet/tcp_timer.c index e8de99b71..9ad7badac 100644 --- a/bsd/netinet/tcp_timer.c +++ b/bsd/netinet/tcp_timer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -800,9 +800,16 @@ tcp_timers(tp, timer) tcpstat.tcps_keepprobe++; t_template = tcp_maketemplate(tp); if (t_template) { + unsigned int ifscope; + + if (tp->t_inpcb->inp_flags & INP_BOUND_IF) + ifscope = tp->t_inpcb->inp_boundif; + else + ifscope = IFSCOPE_NONE; + tcp_respond(tp, t_template->tt_ipgen, &t_template->tt_t, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0, NULL); + tp->rcv_nxt, tp->snd_una - 1, 0, ifscope); (void) m_free(dtom(t_template)); } tp->t_timer[TCPT_KEEP] = tcp_keepintvl; diff --git a/bsd/netinet/tcp_usrreq.c b/bsd/netinet/tcp_usrreq.c index d194a867f..9fcfa87e4 100644 --- a/bsd/netinet/tcp_usrreq.c +++ b/bsd/netinet/tcp_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -710,7 +710,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; - tcp_mss(tp, -1); + tcp_mss(tp, -1, IFSCOPE_NONE); } if (flags & PRUS_EOF) { @@ -759,7 +759,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, if (error) goto out; tp->snd_wnd = TTCP_CLIENT_SND_WND; - tcp_mss(tp, -1); + tcp_mss(tp, -1, IFSCOPE_NONE); } tp->snd_up = tp->snd_una + so->so_snd.sb_cc; tp->t_force = 1; diff --git a/bsd/netinet/tcp_var.h b/bsd/netinet/tcp_var.h index 281e250ff..618fc7fed 100644 --- a/bsd/netinet/tcp_var.h +++ b/bsd/netinet/tcp_var.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -688,7 +688,7 @@ struct rmxp_tao * tcp_gettaocache(struct inpcb *); void tcp_init(void) __attribute__((section("__TEXT, initcode"))); void tcp_input(struct mbuf *, int); -void tcp_mss(struct tcpcb *, int); +void tcp_mss(struct tcpcb *, int, unsigned int); int tcp_mssopt(struct tcpcb *); void tcp_drop_syn_sent(struct inpcb *, int); void tcp_mtudisc(struct inpcb *, int); @@ -697,9 +697,9 @@ struct tcpcb * int tcp_output(struct tcpcb *); void tcp_quench(struct inpcb *, int); void tcp_respond(struct tcpcb *, void *, - struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int, ifnet_t); -struct rtentry * - tcp_rtlookup(struct inpcb *); + struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int, + unsigned int); +struct rtentry *tcp_rtlookup(struct inpcb *, unsigned int); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); struct tcptemp * diff --git a/bsd/netinet/udp_usrreq.c b/bsd/netinet/udp_usrreq.c index 88e5413f5..ec3ff435e 100644 --- a/bsd/netinet/udp_usrreq.c +++ b/bsd/netinet/udp_usrreq.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1008,10 +1008,11 @@ udp_output(inp, m, addr, control, p) struct sockaddr_in *ifaddr; int error = 0, udp_dodisconnect = 0; struct socket *so = inp->inp_socket; - int soopts; + int soopts = 0; struct mbuf *inpopts; struct ip_moptions *mopts; struct route ro; + struct ip_out_args ipoa; KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0,0,0,0,0); @@ -1027,15 +1028,17 @@ udp_output(inp, m, addr, control, p) goto release; } + lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); + + /* If socket was bound to an ifindex, tell ip_output about it */ + ipoa.ipoa_ifscope = (inp->inp_flags & INP_BOUND_IF) ? + inp->inp_boundif : IFSCOPE_NONE; + soopts |= IP_OUTARGS; + /* If there was a routing change, discard cached route and check * that we have a valid source address. 
* Reacquire a new source address if INADDR_ANY was specified */ - -#if 1 - lck_mtx_assert(inp->inpcb_mtx, LCK_MTX_ASSERT_OWNED); -#endif - if (inp->inp_route.ro_rt && inp->inp_route.ro_rt->generation_id != route_generation) { if (ifa_foraddr(inp->inp_laddr.s_addr) == 0) { /* src address is gone */ if (inp->inp_flags & INP_INADDR_ANY) @@ -1158,7 +1161,7 @@ udp_output(inp, m, addr, control, p) m->m_pkthdr.socket_id = get_socket_id(inp->inp_socket); inpopts = inp->inp_options; - soopts = (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); + soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST)); mopts = inp->inp_moptions; /* We don't want to cache the route for non-connected UDP */ @@ -1170,13 +1173,15 @@ udp_output(inp, m, addr, control, p) socket_unlock(so, 0); /* XXX jgraessley please look at XXX */ error = ip_output_list(m, 0, inpopts, - udp_dodisconnect ? &ro : &inp->inp_route, soopts, mopts, NULL); + udp_dodisconnect ? &ro : &inp->inp_route, soopts, mopts, &ipoa); socket_lock(so, 0); if (udp_dodisconnect) { /* Discard the cached route, if there is one */ - if (ro.ro_rt != NULL) + if (ro.ro_rt != NULL) { rtfree(ro.ro_rt); + ro.ro_rt = NULL; + } in_pcbdisconnect(inp); inp->inp_laddr = origladdr; /* XXX rehash? */ } diff --git a/bsd/netinet6/icmp6.c b/bsd/netinet6/icmp6.c index 11b390413..e847a3319 100644 --- a/bsd/netinet6/icmp6.c +++ b/bsd/netinet6/icmp6.c @@ -1040,6 +1040,16 @@ icmp6_mtudisc_update(ip6cp, validated) if (!validated) return; + /* + * In case the suggested mtu is less than IPV6_MMTU, we + * only need to remember that it was for above mentioned + * "alwaysfrag" case. + * Try to be as close to the spec as possible. + */ + if (mtu < IPV6_MMTU) + mtu = IPV6_MMTU - 8; + + bzero(&sin6, sizeof(sin6)); sin6.sin6_family = PF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); @@ -2061,8 +2071,10 @@ icmp6_reflect(m, off) */ bzero(&ro, sizeof(ro)); src = in6_selectsrc(&sa6_src, NULL, NULL, &ro, NULL, &src_storage, &e); - if (ro.ro_rt) + if (ro.ro_rt) { rtfree(ro.ro_rt); /* XXX: we could use this */ + ro.ro_rt = NULL; + } if (src == NULL) { nd6log((LOG_DEBUG, "icmp6_reflect: source can't be determined: " @@ -2307,10 +2319,9 @@ icmp6_redirect_input(m, off) bcopy(&redtgt6, &sgw.sin6_addr, sizeof(struct in6_addr)); bcopy(&reddst6, &sdst.sin6_addr, sizeof(struct in6_addr)); bcopy(&src6, &ssrc.sin6_addr, sizeof(struct in6_addr)); - rtredirect((struct sockaddr *)&sdst, (struct sockaddr *)&sgw, - (struct sockaddr *)NULL, RTF_GATEWAY | RTF_HOST, - (struct sockaddr *)&ssrc, - (struct rtentry **)NULL); + rtredirect(ifp, (struct sockaddr *)&sdst, + (struct sockaddr *)&sgw, NULL, RTF_GATEWAY | RTF_HOST, + (struct sockaddr *)&ssrc, NULL); } /* finally update cached route in each socket via pfctlinput */ { diff --git a/bsd/netinet6/in6_pcb.c b/bsd/netinet6/in6_pcb.c index f98b3d35f..1b481b21f 100644 --- a/bsd/netinet6/in6_pcb.c +++ b/bsd/netinet6/in6_pcb.c @@ -758,8 +758,10 @@ in6_pcbdetach(inp) m_freem(inp->in6p_options); ip6_freepcbopts(inp->in6p_outputopts); ip6_freemoptions(inp->in6p_moptions); - if (inp->in6p_route.ro_rt) + if (inp->in6p_route.ro_rt) { rtfree(inp->in6p_route.ro_rt); + inp->in6p_route.ro_rt = NULL; + } /* Check and free IPv4 related resources in case of mapped addr */ if (inp->inp_options) (void)m_free(inp->inp_options); diff --git a/bsd/netinet6/in6_rmx.c b/bsd/netinet6/in6_rmx.c index fdaf9143f..178dd14d1 100644 --- a/bsd/netinet6/in6_rmx.c +++ b/bsd/netinet6/in6_rmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. 
+ * Copyright (c) 2003-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -133,6 +133,9 @@ static void in6_rtqtimo(void *rock); static void in6_mtutimo(void *rock); extern int tvtohz(struct timeval *); +static struct radix_node *in6_matroute_args(void *, struct radix_node_head *, + rn_matchf_t *, void *); + #define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ /* @@ -236,15 +239,25 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, return ret; } +/* + * Similar to in6_matroute_args except without the leaf-matching parameters. + */ +static struct radix_node * +in6_matroute(void *v_arg, struct radix_node_head *head) +{ + return (in6_matroute_args(v_arg, head, NULL, NULL)); +} + /* * This code is the inverse of in6_clsroute: on first reference, if we * were managing the route, stop doing so and set the expiration timer * back off again. */ static struct radix_node * -in6_matroute(void *v_arg, struct radix_node_head *head) +in6_matroute_args(void *v_arg, struct radix_node_head *head, + rn_matchf_t *f, void *w) { - struct radix_node *rn = rn_match(v_arg, head); + struct radix_node *rn = rn_match_args(v_arg, head, f, w); struct rtentry *rt = (struct rtentry *)rn; if (rt && rt->rt_refcnt == 0) { /* this is first reference */ @@ -253,7 +266,7 @@ in6_matroute(void *v_arg, struct radix_node_head *head) rt->rt_rmx.rmx_expire = 0; } } - return rn; + return (rn); } SYSCTL_DECL(_net_inet6_ip6); @@ -527,6 +540,7 @@ in6_inithead(void **head, int off) rnh = *head; rnh->rnh_addaddr = in6_addroute; rnh->rnh_matchaddr = in6_matroute; + rnh->rnh_matchaddr_args = in6_matroute_args; rnh->rnh_close = in6_clsroute; in6_rtqtimo(rnh); /* kick off timeout first time */ in6_mtutimo(rnh); /* kick off timeout first time */ diff --git a/bsd/netinet6/ip6_fw.c b/bsd/netinet6/ip6_fw.c index 7d5aa9b72..6abcb4cf2 100644 --- a/bsd/netinet6/ip6_fw.c +++ b/bsd/netinet6/ip6_fw.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. + * Copyright (c) 2003-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -866,7 +866,7 @@ got_match: } bcopy(&ti, ip6, sizeof(ti)); tcp_respond(NULL, ip6, (struct tcphdr *)(ip6 + 1), - *m, ack, seq, flags, NULL); + *m, ack, seq, flags, IFSCOPE_NONE); *m = NULL; break; } diff --git a/bsd/netinet6/ip6_output.c b/bsd/netinet6/ip6_output.c index 9e41d205f..b4c6491de 100644 --- a/bsd/netinet6/ip6_output.c +++ b/bsd/netinet6/ip6_output.c @@ -1154,8 +1154,10 @@ done: lck_mtx_unlock(ip6_mutex); if (ro == &ip6route && ro->ro_rt) { /* brace necessary for rtfree */ rtfree(ro->ro_rt); + ro->ro_rt = NULL; } else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) { rtfree(ro_pmtu->ro_rt); + ro_pmtu->ro_rt = NULL; } #if IPSEC @@ -2174,6 +2176,7 @@ ip6_setmoptions( } ifp = ro.ro_rt->rt_ifp; rtfree(ro.ro_rt); + ro.ro_rt = NULL; } } else ifp = ifindex2ifnet[mreq->ipv6mr_interface]; diff --git a/bsd/sys/disk.h b/bsd/sys/disk.h index 879dcb22b..348253107 100644 --- a/bsd/sys/disk.h +++ b/bsd/sys/disk.h @@ -50,6 +50,7 @@ * DKIOCISFORMATTED is media formatted? * DKIOCISWRITABLE is media writable? 
* + * DKIOCREQUESTIDLE idle media * DKIOCDISCARD delete unused data * * DKIOCGETMAXBLOCKCOUNTREAD get maximum block count for reads @@ -114,6 +115,7 @@ typedef struct #define DKIOCISFORMATTED _IOR('d', 23, uint32_t) #define DKIOCISWRITABLE _IOR('d', 29, uint32_t) +#define DKIOCREQUESTIDLE _IO('d', 30) #define DKIOCDISCARD _IOW('d', 31, dk_discard_t) #define DKIOCGETMAXBLOCKCOUNTREAD _IOR('d', 64, uint64_t) diff --git a/bsd/sys/dtrace.h b/bsd/sys/dtrace.h index 9be90489b..ebe0e4f49 100644 --- a/bsd/sys/dtrace.h +++ b/bsd/sys/dtrace.h @@ -2385,6 +2385,8 @@ extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t)); #define DTRACE_INVOP_BCTR 6 #define DTRACE_INVOP_TAILJUMP 7 #endif + + #endif /* __APPLE__ */ #ifdef __cplusplus diff --git a/bsd/sys/fcntl.h b/bsd/sys/fcntl.h index 200b3e7ac..98b5c7c42 100644 --- a/bsd/sys/fcntl.h +++ b/bsd/sys/fcntl.h @@ -258,6 +258,7 @@ typedef __darwin_pid_t pid_t; #define F_FLOCK 0x020 /* Use flock(2) semantics for lock */ #define F_POSIX 0x040 /* Use POSIX semantics for lock */ #define F_PROV 0x080 /* Non-coelesced provisional lock */ +#define F_WAKE1_SAFE 0x100 /* its safe to only wake one waiter */ #endif /* diff --git a/bsd/sys/lockf.h b/bsd/sys/lockf.h index f5f8ad03a..df4dec9c1 100644 --- a/bsd/sys/lockf.h +++ b/bsd/sys/lockf.h @@ -90,6 +90,7 @@ struct lockf { off_t lf_start; /* Byte # of the start of the lock */ off_t lf_end; /* Byte # of the end of the lock (-1=EOF) */ caddr_t lf_id; /* Id of the resource holding the lock */ + uint32_t lf_waiters; /* count of waiters on this lock */ struct lockf **lf_head; /* Back pointer to the head of the locf list */ struct vnode *lf_vnode; /* Back pointer to the inode */ struct lockf *lf_next; /* Pointer to the next lock on this inode */ diff --git a/bsd/sys/lockstat.h b/bsd/sys/lockstat.h index 72d3ca1c8..a7c659970 100644 --- a/bsd/sys/lockstat.h +++ b/bsd/sys/lockstat.h @@ -187,7 +187,6 @@ extern void (*lockstat_probe)(dtrace_id_t, uint64_t, uint64_t, #if CONFIG_DTRACE extern int lockstat_depth(void); extern void lockstat_hot_patch(boolean_t); -extern void dtrace_membar_producer(void); /* * Macros to record lockstat probes. 
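The networking hunks above (rip_output, tcp_ip_output, udp_output, tcp_respond) all apply the same scoped-routing pattern: the caller checks the PCB's INP_BOUND_IF flag, records either the bound interface index or IFSCOPE_NONE in struct ip_out_args, ORs IP_OUTARGS into the output flags, and passes the args as the new final parameter of ip_output()/ip_output_list(). The sketch below is a minimal, self-contained illustration of that selection step only; the inpcb_lite type, the INP_BOUND_IF value, and the select_ifscope() helper are stand-ins invented for this example, and only the field and macro names mirror the patch.

#include <stdio.h>

/*
 * Minimal stand-ins for the kernel objects involved.  The real definitions
 * live in bsd/netinet/in_pcb.h and bsd/netinet/ip_var.h; the flag value
 * below is illustrative, not the kernel's.
 */
#define IFSCOPE_NONE   0u
#define INP_BOUND_IF   0x00000001u      /* hypothetical value */

struct inpcb_lite {
	unsigned int inp_flags;         /* PCB flags */
	unsigned int inp_boundif;       /* ifindex the socket was bound to */
};

struct ip_out_args {
	unsigned int ipoa_ifscope;      /* interface scope, as added by this patch */
};

/*
 * Mirrors the selection logic this patch adds to rip_output(), udp_output()
 * and tcp_ip_output(): use the bound interface if one exists, otherwise
 * leave the scope unset.
 */
static unsigned int
select_ifscope(const struct inpcb_lite *inp)
{
	return (inp->inp_flags & INP_BOUND_IF) ? inp->inp_boundif : IFSCOPE_NONE;
}

int
main(void)
{
	struct inpcb_lite bound   = { INP_BOUND_IF, 4 };  /* bound to ifindex 4 */
	struct inpcb_lite unbound = { 0, 0 };
	struct ip_out_args ipoa;

	ipoa.ipoa_ifscope = select_ifscope(&bound);
	printf("bound socket   -> ipoa_ifscope %u\n", ipoa.ipoa_ifscope);

	ipoa.ipoa_ifscope = select_ifscope(&unbound);
	printf("unbound socket -> ipoa_ifscope %u\n", ipoa.ipoa_ifscope);
	return 0;
}

In the kernel code the caller then hands &ipoa to ip_output()/ip_output_list() together with the IP_OUTARGS flag, as shown in the rip_output(), tcp_ip_output() and udp_output() hunks above.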
diff --git a/bsd/sys/mbuf.h b/bsd/sys/mbuf.h index ec7c2733e..779dbde50 100644 --- a/bsd/sys/mbuf.h +++ b/bsd/sys/mbuf.h @@ -399,6 +399,8 @@ union m16kcluster { /* compatiblity with 4.3 */ #define m_copy(m, o, l) m_copym((m), (o), (l), M_DONTWAIT) +#define MBSHIFT 20 /* 1MB */ + #endif /* KERNEL_PRIVATE */ /* diff --git a/bsd/sys/vnode.h b/bsd/sys/vnode.h index 7aef5e9e8..5814b6eea 100644 --- a/bsd/sys/vnode.h +++ b/bsd/sys/vnode.h @@ -524,6 +524,7 @@ int vnode_ischr(vnode_t); #ifdef __APPLE_API_UNSTABLE int vnode_isnamedstream(vnode_t); +int vnode_isshadow(vnode_t); #endif enum vtype vnode_iftovt(int); diff --git a/bsd/sys/vnode_internal.h b/bsd/sys/vnode_internal.h index 66e32d7c3..8948d8310 100644 --- a/bsd/sys/vnode_internal.h +++ b/bsd/sys/vnode_internal.h @@ -227,7 +227,9 @@ struct vnode { #define VAGE 0x001000 /* Insert vnode at head of free list */ #define VRAOFF 0x002000 /* read ahead disabled */ #define VNCACHEABLE 0x004000 /* vnode is allowed to be put back in name cache */ -#define VUINACTIVE 0x008000 /* UBC vnode is on inactive list */ +#if NAMEDSTREAMS +#define VISSHADOW 0x008000 /* vnode is a shadow file */ +#endif #define VSWAP 0x010000 /* vnode is being used as swapfile */ #define VTHROTTLED 0x020000 /* writes or pageouts have been throttled */ /* wakeup tasks waiting when count falls below threshold */ diff --git a/bsd/vfs/kpi_vfs.c b/bsd/vfs/kpi_vfs.c index 79e526ef2..44c482c8f 100644 --- a/bsd/vfs/kpi_vfs.c +++ b/bsd/vfs/kpi_vfs.c @@ -1599,6 +1599,22 @@ vnode_isnamedstream( #endif } +int +vnode_isshadow( +#if NAMEDSTREAMS + vnode_t vp +#else + __unused vnode_t vp +#endif + ) +{ +#if NAMEDSTREAMS + return ((vp->v_flag & VISSHADOW) ? 1 : 0); +#else + return (0); +#endif +} + /* TBD: set vnode_t to not cache data after it is consumed once; used for quota */ void vnode_setnocache(vnode_t vp) @@ -4366,7 +4382,7 @@ VNOP_INACTIVE(struct vnode *vp, vfs_context_t ctx) */ if (vnode_isnamedstream(vp) && (vp->v_parent != NULLVP) && - ((vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) && + (vnode_isshadow(vp)) && ((vp->v_lflag & VL_TERMINATE) == 0)) { vnode_recycle(vp); } diff --git a/bsd/vfs/vfs_journal.c b/bsd/vfs/vfs_journal.c index eb070de33..6e53e0169 100644 --- a/bsd/vfs/vfs_journal.c +++ b/bsd/vfs/vfs_journal.c @@ -1704,8 +1704,28 @@ journal_open(struct vnode *jvp, } if (phys_blksz != (size_t)jnl->jhdr->jhdr_size && jnl->jhdr->jhdr_size != 0) { - printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d\n", - jdev_name, phys_blksz, jnl->jhdr->jhdr_size); + /* + * The volume has probably been resized (such that we had to adjust the + * logical sector size), or copied to media with a different logical + * sector size. If the journal is empty, then just switch to the + * current logical sector size. If the journal is not empty, then + * fail to open the journal. 
+ */ + + if (jnl->jhdr->start == jnl->jhdr->end) { + int err; + printf("jnl: %s: open: changing journal header size from %d to %lu\n", + jdev_name, jnl->jhdr->jhdr_size, phys_blksz); + jnl->jhdr->jhdr_size = phys_blksz; + if (write_journal_header(jnl)) { + printf("jnl: %s: open: failed to update journal header size\n", jdev_name); + goto bad_journal; + } + } else { + printf("jnl: %s: open: phys_blksz %lu does not match journal header size %d, and journal is not empty!\n", + jdev_name, phys_blksz, jnl->jhdr->jhdr_size); + goto bad_journal; + } } if ( jnl->jhdr->start <= 0 diff --git a/bsd/vfs/vfs_lookup.c b/bsd/vfs/vfs_lookup.c index bb8c5dd2b..0c5299ae6 100644 --- a/bsd/vfs/vfs_lookup.c +++ b/bsd/vfs/vfs_lookup.c @@ -935,10 +935,22 @@ nextname: } switch (cnp->cn_nameiop) { case DELETE: - nsop = NS_DELETE; + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_DELETE; + } + else { + error = EPERM; + goto bad; + } break; case CREATE: - nsop = NS_CREATE; + if (cnp->cn_flags & CN_ALLOWRSRCFORK) { + nsop = NS_CREATE; + } + else { + error = EPERM; + goto bad; + } break; case LOOKUP: /* Make sure our lookup of "/..namedfork/rsrc" is allowed. */ diff --git a/bsd/vfs/vfs_subr.c b/bsd/vfs/vfs_subr.c index c5d91125d..bfee0d8b4 100644 --- a/bsd/vfs/vfs_subr.c +++ b/bsd/vfs/vfs_subr.c @@ -1195,8 +1195,6 @@ insmntque(vnode_t vp, mount_t mp) TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); if (vp->v_lflag & VNAMED_MOUNT) panic("insmntque: vp already in mount vnode list"); - if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) - panic("insmntque: vp on the free list\n"); vp->v_lflag |= VNAMED_MOUNT; mount_ref(mp, 1); mount_unlock(mp); @@ -1976,7 +1974,7 @@ vclean(vnode_t vp, int flags) /* Delete the shadow stream file before we reclaim its vnode */ if ((is_namedstream != 0) && (vp->v_parent != NULLVP) && - ((vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)) { + (vnode_isshadow(vp))) { vnode_relenamedstream(vp->v_parent, vp, ctx); } #endif @@ -4019,6 +4017,9 @@ vnode_create(int flavor, size_t size, void *data, vnode_t *vpp) if (param->vnfs_mp->mnt_kern_flag & MNTK_LOCK_LOCAL) vp->v_flag |= VLOCKLOCAL; if (insert) { + if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb)) + panic("insmntque: vp on the free list\n"); + /* * enter in mount vnode list */ diff --git a/bsd/vfs/vfs_syscalls.c b/bsd/vfs/vfs_syscalls.c index be9bfe17a..869f3f5b3 100644 --- a/bsd/vfs/vfs_syscalls.c +++ b/bsd/vfs/vfs_syscalls.c @@ -2938,6 +2938,11 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) int need_event = 0; int has_listeners = 0; +#if NAMEDRSRCFORK + /* unlink or delete is allowed on rsrc forks and named streams */ + ndp->ni_cnd.cn_flags |= CN_ALLOWRSRCFORK; +#endif + ndp->ni_cnd.cn_flags |= LOCKPARENT; cnp = &ndp->ni_cnd; @@ -3051,6 +3056,15 @@ unlink1(vfs_context_t ctx, struct nameidata *ndp, int nodelbusy) * since it may need to release the fs_nodelock on the dvp */ out: +#if NAMEDRSRCFORK + /* recycle deleted rsrc fork to force reclaim on shadow file if necessary */ + if ((vnode_isnamedstream(ndp->ni_vp)) && + (ndp->ni_vp->v_parent != NULLVP) && + (vnode_isshadow(ndp->ni_vp))) { + vnode_recycle(ndp->ni_vp); + } +#endif + nameidone(ndp); vnode_put(dvp); vnode_put(vp); @@ -3540,7 +3554,7 @@ access(__unused proc_t p, struct access_args *uap, __unused register_t *retval) */ if (vnode_isnamedstream(nd.ni_vp) && (nd.ni_vp->v_parent != NULLVP) && - ((nd.ni_vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)) { + (vnode_isshadow(nd.ni_vp))) { is_namedstream = 1; 
vnode_ref(nd.ni_vp); } @@ -3606,7 +3620,7 @@ stat2(vfs_context_t ctx, struct nameidata *ndp, user_addr_t ub, user_addr_t xsec */ if (vnode_isnamedstream(ndp->ni_vp) && (ndp->ni_vp->v_parent != NULLVP) && - ((ndp->ni_vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)) { + (vnode_isshadow(ndp->ni_vp))) { is_namedstream = 1; vnode_ref (ndp->ni_vp); } @@ -4593,7 +4607,7 @@ fsync_nocancel(proc_t p, struct fsync_nocancel_args *uap, __unused register_t *r if ((error == 0) && (vp->v_flag & VISNAMEDSTREAM) && (vp->v_parent != NULLVP) && - !(vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) && + (vnode_isshadow(vp)) && (fp->f_flags & FP_WRITTEN)) { (void) vnode_flushnamedstream(vp->v_parent, vp, ctx); } diff --git a/bsd/vfs/vfs_vnops.c b/bsd/vfs/vfs_vnops.c index a8fc43f7c..0eb1036ad 100644 --- a/bsd/vfs/vfs_vnops.c +++ b/bsd/vfs/vfs_vnops.c @@ -424,7 +424,13 @@ bad2: bad: ndp->ni_vp = NULL; if (vp) { - vnode_put(vp); +#if NAMEDRSRCFORK + if ((vnode_isnamedstream(vp)) && (vp->v_parent != NULLVP) && + (vnode_isshadow (vp))) { + vnode_recycle(vp); + } +#endif + vnode_put(vp); /* * Check for a race against unlink. We had a vnode * but according to vnode_authorize or VNOP_OPEN it @@ -489,7 +495,7 @@ vn_close(struct vnode *vp, int flags, vfs_context_t ctx) /* Sync data from resource fork shadow file if needed. */ if ((vp->v_flag & VISNAMEDSTREAM) && (vp->v_parent != NULLVP) && - !(vp->v_parent->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS)) { + (vnode_isshadow(vp))) { if (flags & FWASWRITTEN) { (void) vnode_flushnamedstream(vp->v_parent, vp, ctx); } diff --git a/bsd/vfs/vfs_xattr.c b/bsd/vfs/vfs_xattr.c index 5c59cfbc8..43f8991d8 100644 --- a/bsd/vfs/vfs_xattr.c +++ b/bsd/vfs/vfs_xattr.c @@ -394,11 +394,16 @@ vnode_getnamedstream(vnode_t vp, vnode_t *svpp, const char *name, enum nsoperati error = default_getnamedstream(vp, svpp, name, op, context); if (error == 0) { + uint32_t streamflags = VISNAMEDSTREAM; vnode_t svp = *svpp; - + + if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) { + streamflags |= VISSHADOW; + } + /* Tag the vnode. */ - vnode_lock(svp); - svp->v_flag |= VISNAMEDSTREAM; + vnode_lock_spin(svp); + svp->v_flag |= streamflags; vnode_unlock(svp); /* Make the file its parent. * Note: This parent link helps us distinguish vnodes for @@ -427,12 +432,19 @@ vnode_makenamedstream(vnode_t vp, vnode_t *svpp, const char *name, int flags, vf error = default_makenamedstream(vp, svpp, name, context); if (error == 0) { + uint32_t streamflags = VISNAMEDSTREAM; vnode_t svp = *svpp; /* Tag the vnode. */ - vnode_lock(svp); - svp->v_flag |= VISNAMEDSTREAM; - vnode_unlock(svp); + if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) { + streamflags |= VISSHADOW; + } + + /* Tag the vnode. */ + vnode_lock_spin(svp); + svp->v_flag |= streamflags; + vnode_unlock(svp); + /* Make the file its parent. 
* Note: This parent link helps us distinguish vnodes for * shadow stream files from vnodes for resource fork on file diff --git a/config/Libkern.exports b/config/Libkern.exports index cb7928a5a..8438edd9c 100644 --- a/config/Libkern.exports +++ b/config/Libkern.exports @@ -809,13 +809,10 @@ _sha1_init:_SHA1Init _sha1_loop:_SHA1Update _sha1_result:_SHA1Final_r _snprintf -_sprintf _sscanf _strcasecmp -_strcat _strchr _strcmp -_strcpy _STRDUP _strlen _strncasecmp diff --git a/config/Libkern.i386.exports b/config/Libkern.i386.exports index 362488593..d7f49b799 100644 --- a/config/Libkern.i386.exports +++ b/config/Libkern.i386.exports @@ -1,2 +1,5 @@ _OSCompareAndSwap64 _OSAddAtomic64 +_strcpy +_strcat +_sprintf diff --git a/config/Libkern.ppc.exports b/config/Libkern.ppc.exports index df175fdcc..4531e8434 100644 --- a/config/Libkern.ppc.exports +++ b/config/Libkern.ppc.exports @@ -18,4 +18,6 @@ __ZN8OSObject19_RESERVEDOSObject30Ev __ZN8OSObject19_RESERVEDOSObject31Ev _bcopy_nc _bzero_nc - +_strcpy +_strcat +_sprintf diff --git a/config/Makefile b/config/Makefile index 78fb43d85..db72f8acd 100644 --- a/config/Makefile +++ b/config/Makefile @@ -93,7 +93,7 @@ $(OBJPATH)/allsymbols: $(OBJPATH)/mach_kernel $(SYMBOL_SET_BUILD): $(OBJPATH)/%.symbolset : %.exports %.$(ARCH_CONFIG_LC).exports $(OBJPATH)/allsymbols $(_v)$(KEXT_CREATE_SYMBOL_SET) \ - $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ -import $(OBJPATH)/allsymbols \ -export $*.exports \ -export $*.$(ARCH_CONFIG_LC).exports \ @@ -109,7 +109,7 @@ endif build_symbol_sets: $(SYMBOL_SET_BUILD) $(_v)$(KEXT_CREATE_SYMBOL_SET) \ - $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ + $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_ALL_)) \ -import $(OBJPATH)/allsymbols \ -export $(SRCROOT)/$(COMPONENT)/Libkern.exports \ -export $(SRCROOT)/$(COMPONENT)/Libkern.$(ARCH_CONFIG_LC).exports \ diff --git a/config/MasterVersion b/config/MasterVersion index 38648f07c..58454343e 100644 --- a/config/MasterVersion +++ b/config/MasterVersion @@ -1,4 +1,4 @@ -9.6.0 +9.7.0 # The first line of this file contains the master version number for the kernel. # All other instances of the kernel version in xnu are derived from this file. 
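Several of the VFS hunks above (VNOP_INACTIVE, vclean, unlink1, access, stat2, fsync, vn_open, vn_close) replace the open-coded test on the parent volume's MNTK_NAMED_STREAMS flag with the new vnode_isshadow() accessor, which reports whether the VISSHADOW flag was set on the stream vnode when it was created (see the vfs_xattr.c hunks). Below is a minimal, self-contained sketch of that tag-and-query flow; the vnode_lite type and the VISNAMEDSTREAM/MNTK_NAMED_STREAMS values are illustrative stand-ins, while VISSHADOW's value (0x008000) and the logic itself come from the patch.

#include <stdio.h>

/*
 * Stand-ins for the vnode flag machinery this patch touches.  VISSHADOW's
 * value comes from bsd/sys/vnode_internal.h in this patch; the other two
 * values are illustrative only.
 */
#define VISNAMEDSTREAM      0x000400u   /* hypothetical value */
#define VISSHADOW           0x008000u   /* value defined by this patch */
#define MNTK_NAMED_STREAMS  0x000001u   /* hypothetical value */

struct vnode_lite {
	unsigned int v_flag;
};

/* Mirrors vnode_isshadow() as added to bsd/vfs/kpi_vfs.c. */
static int
vnode_isshadow_lite(const struct vnode_lite *vp)
{
	return (vp->v_flag & VISSHADOW) ? 1 : 0;
}

/*
 * Mirrors the tagging added to vnode_getnamedstream()/vnode_makenamedstream():
 * a named-stream vnode is additionally marked as a shadow file when the
 * volume lacks native named-stream support.
 */
static void
tag_named_stream(struct vnode_lite *svp, unsigned int mnt_kern_flag)
{
	unsigned int streamflags = VISNAMEDSTREAM;

	if ((mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)
		streamflags |= VISSHADOW;
	svp->v_flag |= streamflags;
}

int
main(void)
{
	struct vnode_lite no_native = { 0 };   /* volume without native streams */
	struct vnode_lite native    = { 0 };   /* volume with native streams */

	tag_named_stream(&no_native, 0);
	tag_named_stream(&native, MNTK_NAMED_STREAMS);

	printf("shadow-backed stream: isshadow=%d\n", vnode_isshadow_lite(&no_native));
	printf("native stream:        isshadow=%d\n", vnode_isshadow_lite(&native));
	return 0;
}

One apparent motivation is that the shadow-file property is recorded once on the stream vnode at creation time rather than re-derived from the parent mount's flags on every check.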
diff --git a/config/System6.0.exports b/config/System6.0.exports index efaa7c605..d4882c538 100644 --- a/config/System6.0.exports +++ b/config/System6.0.exports @@ -3266,13 +3266,10 @@ _splsoftclock _spltty _splvm _splx -_sprintf _sscanf _stack_privilege -_strcat _strchr _strcmp -_strcpy _strlen _strncat _strncmp diff --git a/config/System6.0.i386.exports b/config/System6.0.i386.exports index e558b7286..e876d829f 100644 --- a/config/System6.0.i386.exports +++ b/config/System6.0.i386.exports @@ -22,3 +22,6 @@ _rtc_clock_stepping _smp_initialized __ZN24IOBufferMemoryDescriptor20initWithPhysicalMaskEP4taskmyyy __ZN24IOBufferMemoryDescriptor22inTaskWithPhysicalMaskEP4taskmyy +_strcat +_strcpy +_sprintf diff --git a/config/System6.0.ppc.exports b/config/System6.0.ppc.exports index 66b5ad2e0..000f473b0 100644 --- a/config/System6.0.ppc.exports +++ b/config/System6.0.ppc.exports @@ -248,3 +248,6 @@ _pmsRunLocal _scc _rc4_crypt _rc4_init +_strcat +_strcpy +_sprintf diff --git a/iokit/IOKit/IOBufferMemoryDescriptor.h b/iokit/IOKit/IOBufferMemoryDescriptor.h index f5922d5a5..f42f66489 100644 --- a/iokit/IOKit/IOBufferMemoryDescriptor.h +++ b/iokit/IOKit/IOBufferMemoryDescriptor.h @@ -36,7 +36,13 @@ enum { kIOMemoryPurgeable = 0x00000040, kIOMemorySharingTypeMask = 0x000f0000, kIOMemoryUnshared = 0x00000000, - kIOMemoryKernelUserShared = 0x00010000 + kIOMemoryKernelUserShared = 0x00010000, + // shared IOMemoryDescriptor options for IOBufferMemoryDescriptor: + kIOBufferDescriptorMemoryFlags = kIOMemoryDirectionMask +#ifdef XNU_KERNEL_PRIVATE + | kIOMemoryAutoPrepare +#endif + | kIOMemoryThreadSafe }; #define _IOBUFFERMEMORYDESCRIPTOR_INTASKWITHOPTIONS_ 1 diff --git a/iokit/IOKit/IOMemoryDescriptor.h b/iokit/IOKit/IOMemoryDescriptor.h index 080f692f5..ea04c67e2 100644 --- a/iokit/IOKit/IOMemoryDescriptor.h +++ b/iokit/IOKit/IOMemoryDescriptor.h @@ -78,8 +78,15 @@ enum { kIOMemoryAsReference = 0x00000100, kIOMemoryBufferPageable = 0x00000400, kIOMemoryDontMap = 0x00000800, +#ifdef XNU_KERNEL_PRIVATE + kIOMemoryRedirected = 0x00004000, + kIOMemoryPreparedReadOnly = 0x00008000, +#endif kIOMemoryPersistent = 0x00010000, - kIOMemoryThreadSafe = 0x00020000 +#ifdef XNU_KERNEL_PRIVATE + kIOMemoryReserved6156215 = 0x00020000, +#endif + kIOMemoryThreadSafe = 0x00100000, // Shared with Buffer MD }; #define kIOMapperNone ((IOMapper *) -1) @@ -742,13 +749,6 @@ public: // might be created by IOMemoryDescriptor::withAddress(), but there should be // no need to reference as anything but a generic IOMemoryDescriptor *. -// Also these flags should not overlap with the options to -// IOMemoryDescriptor::initWithRanges(... 
IOOptionsBits options); - -enum { - kIOMemoryPreparedReadOnly = 0x00008000, -}; - class IOGeneralMemoryDescriptor : public IOMemoryDescriptor { OSDeclareDefaultStructors(IOGeneralMemoryDescriptor);
diff --git a/iokit/Kernel/IOBufferMemoryDescriptor.cpp b/iokit/Kernel/IOBufferMemoryDescriptor.cpp index 5fbfc6715..8358a9537 100644 --- a/iokit/Kernel/IOBufferMemoryDescriptor.cpp +++ b/iokit/Kernel/IOBufferMemoryDescriptor.cpp @@ -31,6 +31,7 @@ #include #include #include +#include #include "IOKitKernelInternal.h" #include "IOCopyMapper.h" @@ -132,8 +133,8 @@ bool IOBufferMemoryDescriptor::initWithPhysicalMask( range.length = 0; _ranges.v64 = &range; - // Grab the direction and the Auto Prepare bits from the Buffer MD options - iomdOptions |= options & (kIOMemoryDirectionMask | kIOMemoryAutoPrepare); + // Grab IOMD bits from the Buffer MD options + iomdOptions |= (options & kIOBufferDescriptorMemoryFlags); if ((options & (kIOMemorySharingTypeMask | kIOMapCacheMask)) && (alignment < page_size)) alignment = page_size;
diff --git a/iokit/Kernel/IOCatalogue.cpp b/iokit/Kernel/IOCatalogue.cpp index 0b5f54a25..7ca1e8c46 100644 --- a/iokit/Kernel/IOCatalogue.cpp +++ b/iokit/Kernel/IOCatalogue.cpp @@ -1251,6 +1251,7 @@ IOReturn IOCatalogue::unloadModule( OSString * moduleName ) const name = moduleName->getCStringNoCopy(); k_info = kmod_lookupbyname_locked((char *)name); if ( k_info && (k_info->reference_count < 1) ) { + record_kext_unload(k_info->id); if ( k_info->stop && !((ret = k_info->stop(k_info, 0)) == kIOReturnSuccess) ) {
diff --git a/iokit/Kernel/IOHibernateIO.cpp b/iokit/Kernel/IOHibernateIO.cpp index ae66cb9b8..9105cdbb9 100644 --- a/iokit/Kernel/IOHibernateIO.cpp +++ b/iokit/Kernel/IOHibernateIO.cpp @@ -2145,6 +2145,9 @@ hibernate_write_image(void) uncompressedSize ?
((int) ((compressedSize * 100ULL) / uncompressedSize)) : 0, sum1, sum2); + if (vars->fileVars->io) + (void) IOHibernatePollerIODone(vars->fileVars, false); + if (pollerOpen) IOHibernatePollerClose(vars->fileVars, kIOPolledBeforeSleepState); diff --git a/iokit/Kernel/IOMemoryDescriptor.cpp b/iokit/Kernel/IOMemoryDescriptor.cpp index b86f7f651..58e7190e6 100644 --- a/iokit/Kernel/IOMemoryDescriptor.cpp +++ b/iokit/Kernel/IOMemoryDescriptor.cpp @@ -813,6 +813,12 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers, gIOSystemMapper = mapper = IOMapper::gSystem; } + // Temp binary compatibility for kIOMemoryThreadSafe + if (kIOMemoryReserved6156215 & options) + { + options &= ~kIOMemoryReserved6156215; + options |= kIOMemoryThreadSafe; + } // Remove the dynamic internal use flags from the initial setting options &= ~(kIOMemoryPreparedReadOnly); _flags = options; @@ -2566,10 +2572,6 @@ IOReturn IOMemoryDescriptor::doMap( return (err); } -enum { - kIOMemoryRedirected = 0x00010000 -}; - IOReturn IOMemoryDescriptor::handleFault( void * _pager, vm_map_t addressMap, diff --git a/iokit/Kernel/IOTimerEventSource.cpp b/iokit/Kernel/IOTimerEventSource.cpp index aca7f3fbc..d32a17887 100644 --- a/iokit/Kernel/IOTimerEventSource.cpp +++ b/iokit/Kernel/IOTimerEventSource.cpp @@ -299,7 +299,7 @@ IOReturn IOTimerEventSource::wakeAtTime(AbsoluteTime inAbstime) return kIOReturnNoResources; abstime = inAbstime; - if ( enabled && AbsoluteTime_to_scalar(&abstime) && workLoop ) + if ( enabled && AbsoluteTime_to_scalar(&inAbstime) && AbsoluteTime_to_scalar(&abstime) && workLoop ) { if (reserved) { @@ -308,14 +308,14 @@ IOReturn IOTimerEventSource::wakeAtTime(AbsoluteTime inAbstime) reserved->workLoop = workLoop; reserved->calloutGeneration++; if (thread_call_enter1_delayed((thread_call_t) calloutEntry, - (void *) reserved->calloutGeneration, abstime)) + (void *) reserved->calloutGeneration, inAbstime)) { release(); workLoop->release(); } } else - thread_call_enter_delayed((thread_call_t) calloutEntry, abstime); + thread_call_enter_delayed((thread_call_t) calloutEntry, inAbstime); } return kIOReturnSuccess; diff --git a/iokit/Kernel/IOUserClient.cpp b/iokit/Kernel/IOUserClient.cpp index bb451f8c4..940197e34 100644 --- a/iokit/Kernel/IOUserClient.cpp +++ b/iokit/Kernel/IOUserClient.cpp @@ -210,7 +210,7 @@ bool IOMachPort::noMoreSendersForObject( OSObject * obj, machPort = (IOMachPort *) dict->getObject( (const OSSymbol *) obj ); if( machPort) { - destroyed = (machPort->mscount == *mscount); + destroyed = (machPort->mscount <= *mscount); if( destroyed) dict->removeObject( (const OSSymbol *) obj ); else diff --git a/kgmacros b/kgmacros index 1fa767c15..7623092e2 100644 --- a/kgmacros +++ b/kgmacros @@ -159,6 +159,11 @@ document kgm | kdp-reenter Schedule reentry into the debugger and continue. | kdp-reboot Restart remote target | +| zstack Print zalloc caller stack (zone leak debugging) +| findoldest Find oldest zone leak debugging record +| countpcs Print how often a pc occurs in the zone leak log +| +| | Type "help " for more specific help on a particular macro. | Type "show user " to see what the macro is really doing. 
end @@ -454,6 +459,7 @@ define showcurrentthreads set $kgm_prp = (struct processor *)processor_list while $kgm_prp != 0 + printf "Processor 0x%08x State %d (cpu_id %x)\n", $kgm_prp, ($kgm_prp)->state, ($kgm_prp)->cpu_num if ($kgm_prp)->active_thread != 0 set $kgm_actp = ($kgm_prp)->active_thread showtaskheader
@@ -504,6 +510,7 @@ define showcurrentstacks set $kgm_prp = processor_list while $kgm_prp != 0 + printf "Processor 0x%08x State %d (cpu_id %x)\n", $kgm_prp, ($kgm_prp)->state, ($kgm_prp)->cpu_num if ($kgm_prp)->active_thread != 0 set $kgm_actp = ($kgm_prp)->active_thread showtaskheader
@@ -2539,6 +2546,10 @@ define showobjectint set $kgm_obj = (OSObject *) $arg1 set $kgm_vt = *((void **) $arg1) + if ($kgm_mtype == 12) + set $kgm_vt = $kgm_vt - 2 * sizeof(void *) + end + if ($kgm_show_object_addrs) printf "`object %p, vt ", $arg1 output /a (unsigned) $kgm_vt
@@ -2668,6 +2679,9 @@ define showregistryentryrecurse printf " | For page-tables in translate to physical address. end
+define zstack + set $index = $arg0 + + if (log_records == 0) + set $count = 0 + printf "Zone logging not enabled. Add 'zlog=<zone name>' to boot-args.\n" + else + if ($argc == 2) + set $count = $arg1 + else + set $count = 1 + end + end + + while ($count) + printf "\n--------------- " + + if (zrecords[$index].z_opcode == 1) + printf "ALLOC " + else + printf "FREE " + end + + printf " 0x%x : index %d : ztime %d -------------\n", zrecords[$index].z_element, $index, zrecords[$index].z_time + + set $frame = 0 + + while ($frame < 15) + set $frame_pc = zrecords[$index].z_pc[$frame] + + if ($frame_pc == 0) + loop_break + end + + x/i $frame_pc + set $frame = $frame + 1 + end + + set $index = $index + 1 + set $count = $count - 1 + end +end +
+document zstack +Syntax: (gdb) zstack <index> [<count>] +| Zone leak debugging: print the stack trace of log element at <index>. +| If a <count> is supplied, it prints log elements starting at <index>. +| +| The suggested usage is to look at indexes below zcurrent and look for common stack traces. +| The stack trace that occurs the most is probably the cause of the leak. Find the pc of the +| function calling into zalloc and use the countpcs kgmacro to find out how often that pc occurs in the log. +| The pc occurring in a high percentage of records is most likely the source of the leak. +| +| The findoldest kgmacro is also useful for leak debugging since it identifies the oldest record +| in the log, which may indicate the leaker. +end +
+define findoldest + set $index = 0 + set $count = log_records + set $cur_min = 2000000000 + set $cur_index = 0 + + if (log_records == 0) + printf "Zone logging not enabled. Add 'zlog=<zone name>' to boot-args.\n" + else + + while ($count) + if (zrecords[$index].z_element && zrecords[$index].z_time < $cur_min) + set $cur_index = $index + set $cur_min = zrecords[$index].z_time + end + + set $count = $count - 1 + set $index = $index + 1 + end + + printf "oldest record is at log index %d:\n", $cur_index + zstack $cur_index + end +end +
+document findoldest +Syntax: (gdb) findoldest +| Zone leak debugging: find and print the oldest record in the log. Note that this command +| can take several minutes to run since it uses linear search. +| +| Once it prints a stack trace, find the pc of the caller above all the zalloc, kalloc and +| IOKit layers. Then use the countpcs kgmacro to see how often this caller has allocated +| memory. A caller with a high percentage of records in the log is probably the leaker.
+end +
+define countpcs + set $target_pc = $arg0 + set $index = 0 + set $count = log_records + set $found = 0 + + if (log_records == 0) + printf "Zone logging not enabled. Add 'zlog=<zone name>' to boot-args.\n" + else + + while ($count) + set $frame = 0 + + if (zrecords[$index].z_element != 0) + while ($frame < 15) + if (zrecords[$index].z_pc[$frame] == $target_pc) + set $found = $found + 1 + set $frame = 15 + end + + set $frame = $frame + 1 + end + end + + set $index = $index + 1 + set $count = $count - 1 + end + + printf "occurred %d times in log (%d%c of records)\n", $found, ($found * 100) / zrecorded, '%' + end +end +
+document countpcs +Syntax: (gdb) countpcs <pc> +| Zone leak debugging: search the log and print a count of all log entries that contain the given +| <pc> in the stack trace. This is useful for verifying a suspected <pc> as being the source of + the leak. If a high percentage of the log entries contain the given <pc>, then it's most + likely the source of the leak. Note that this command can take several minutes to run. +end +
+define findelem + set $fe_index = zcurrent + set $fe_count = log_records + set $fe_elem = $arg0 + set $fe_prev_op = -1 + + if (log_records == 0) + printf "Zone logging not enabled. Add 'zlog=<zone name>' to boot-args.\n" + end + + while ($fe_count) + if (zrecords[$fe_index].z_element == $fe_elem) + zstack $fe_index + + if (zrecords[$fe_index].z_opcode == $fe_prev_op) + printf "*************** DOUBLE OP! *********************\n" + end + + set $fe_prev_op = zrecords[$fe_index].z_opcode + end + + set $fe_count = $fe_count - 1 + set $fe_index = $fe_index + 1 + + if ($fe_index >= log_records) + set $fe_index = 0 + end + end +end +
+document findelem +Syntax: (gdb) findelem <elem addr> +| Zone corruption debugging: search the log and print out the stack traces for all log entries that +| refer to the given zone element. When the kernel panics due to a corrupted zone element, get the +| element address and use this macro. This will show you the stack traces of all logged zalloc and +| zfree operations which tells you who touched the element in the recent past. This also makes +| double-frees readily apparent.
+end diff --git a/libkern/Makefile b/libkern/Makefile index 3f1d09b7a..ccff380d6 100644 --- a/libkern/Makefile +++ b/libkern/Makefile @@ -16,7 +16,6 @@ INSTINC_SUBDIRS_I386 = ${INSTINC_SUBDIRS} INSTINC_SUBDIRS_ARM = ${INSTINC_SUBDIRS} - EXPINC_SUBDIRS = \ libkern \ uuid @@ -29,7 +28,7 @@ EXPINC_SUBDIRS_ARM = ${EXPINC_SUBDIRS} SETUP_SUBDIRS = conf -COMP_SUBDIRS = conf +COMP_SUBDIRS = conf kmod INST_SUBDIRS = kmod diff --git a/libkern/c++/OSMetaClass.cpp b/libkern/c++/OSMetaClass.cpp index f8ac932d7..f7f1e0a3b 100644 --- a/libkern/c++/OSMetaClass.cpp +++ b/libkern/c++/OSMetaClass.cpp @@ -652,6 +652,7 @@ static void _OSMetaClassConsiderUnloads(__unused thread_call_param_t p0, classes->release(); if (0 == checkClass) { + record_kext_unload(ki->id); OSRuntimeUnloadCPP(ki, 0); // call destructors ret = kmod_destroy(host_priv_self(), ki->id); didUnload = true; diff --git a/libkern/kmod/Makefile.kmod b/libkern/kmod/Makefile.kmod index fcc97f211..bab4de575 100644 --- a/libkern/kmod/Makefile.kmod +++ b/libkern/kmod/Makefile.kmod @@ -7,7 +7,6 @@ export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir - include $(MakeInc_cmd) include $(MakeInc_def) @@ -28,7 +27,6 @@ COMPOBJROOT = $(OBJROOT)/$(KERNEL_CONFIG)_$(ARCH_CONFIG)/$(COMPONENT)/kmod INSTOBJROOT = $(OBJROOT)/$(INSTALL_TYPE)_$(ARCH_CONFIG)/$(COMPONENT)/kmod endif - KMOD_CFILES = c_start.c c_stop.c KMODCPP_CFILES = cplus_start.c cplus_stop.c @@ -38,36 +36,62 @@ KMODCPP_OFILES = $(KMODCPP_CFILES:.c=.o) ALL_OFILES = $(KMOD_OFILES) $(KMODCPP_OFILES) $(ALL_OFILES): %.o : %.c - ${KCC} -c ${CFLAGS} ${${join $@,_CFLAGS}} ${INCFLAGS} ${${join $@,_INCFLAGS}} -o $(COMPOBJROOT)/$(*F).o $< + @echo CC $@ + $(_v)${KCC} -c ${CFLAGS} ${${join $@,_CFLAGS}} ${INCFLAGS} ${${join $@,_INCFLAGS}} -o $(COMPOBJROOT)/$(*F).o $< $(COMPOBJROOT)/$(KMOD_NAME).a: $(KMOD_OFILES) - libtool -static -o $@ $^ + @echo LIBTOOL $@ + $(_v)libtool -static -o $@ $^ $(COMPOBJROOT)/$(KMODCPP_NAME).a: $(KMODCPP_OFILES) - libtool -static -o $@ $^ + @echo LIBTOOL $@ + $(_v)libtool -static -o $@ $^ do_build_all: $(COMPOBJROOT)/$(KMOD_NAME).a $(COMPOBJROOT)/$(KMODCPP_NAME).a $(INSTALL_DIR)/%.a: $(INSTOBJROOT)/%.a - @allarchs=""; \ - for onearch in $(INSTALL_ARCHS); do \ - if [ $(MACHINE_CONFIG) = DEFAULT ] ; then \ - archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}/$(COMPONENT); \ - else \ - archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}_$(MACHINE_CONFIG)/$(COMPONENT); \ - fi; \ - if [ -e $${archdir}/kmod/$(*F).a ]; then \ - allarchs="$${allarchs} $${archdir}/kmod/$(*F).a"; \ - fi; \ - done; \ + @echo Installing $< in $@; $(RM) $@ || true; \ ${MKDIR} $(INSTALL_DIR) $(SYMROOT); \ - cmd="lipo $${allarchs} -create -output $(SYMROOT)/$(*F).a"; \ - echo $$cmd; eval $$cmd; \ - cmd="install $(LIB_INSTALL_FLAGS) $(SYMROOT)/$(*F).a $@"; \ + $(_v)if [ $(MACHINE_CONFIG) = DEFAULT ] ; then \ + allarchs=""; \ + for onearch in $(INSTALL_ARCHS); do \ + archdir=$(OBJROOT)/$(KERNEL_CONFIG)_$${onearch}/$(COMPONENT); \ + if [ -e $${archdir}/kmod/$(*F).a ]; then \ + allarchs="$${allarchs} $${archdir}/kmod/$(*F).a"; \ + fi; \ + done; \ + cmd="$(LIPO) $${allarchs} -create -output $(SYMROOT)/$(*F).a"; \ + echo $$cmd; eval $$cmd; \ + else \ + my_counter=1; \ + my_innercounter=1; \ + outputfile=$(SYMROOT)/$(*F).a; \ + for my_config in $(TARGET_CONFIGS_UC); do \ + if [ $${my_counter} -eq 1 ]; then \ + my_counter=2; \ + my_kconfig=$${my_config}; \ + elif [ $${my_counter} -eq 2 ]; then \ + my_counter=3; \ + 
my_aconfig=$${my_config}; \ + else \ + my_counter=1; \ + inputfile=$(OBJROOT)/$${my_kconfig}_$${my_aconfig}_$${my_config}/$(COMPONENT)/kmod/$(*F).a; \ + if [ -e $${inputfile} ]; then \ + if [ $${my_innercounter} -eq 1 ]; then \ + my_innercounter=2; \ + cmd="$(LIPO) -create $${inputfile} -o $${outputfile}"; \ + else \ + cmd="$(LIPO) -create $${outputfile} $${inputfile} -o $${outputfile} || true"; \ + fi; \ + echo $$cmd; eval $$cmd; \ + fi; \ + fi; \ + done; \ + fi; \ + cmd="$(INSTALL) $(LIB_INSTALL_FLAGS) $(SYMROOT)/$(*F).a $@"; \ echo $$cmd; eval $$cmd - do_build_install: $(INSTALL_DIR)/$(KMOD_NAME).a $(INSTALL_DIR)/$(KMODCPP_NAME).a # include $(MakeInc_rule) diff --git a/libsa/catalogue.cpp b/libsa/catalogue.cpp index ab7ce249a..6b650c0bd 100644 --- a/libsa/catalogue.cpp +++ b/libsa/catalogue.cpp @@ -502,30 +502,6 @@ OSDictionary * compareExtensionVersions( goto finish; } - if (0 == strcmp("com.apple.driver.AppleIntelCPUPowerManagement", - incumbentName->getCStringNoCopy())) { - /* Special rules. Always favor version 51.0.0 exactly at the - * expense of all other versions newer or older. - */ - if(0 == strcmp(incumbentVersionString->getCStringNoCopy(), "51.0.0")) { - IOLog(VTYELLOW "Skipping duplicate extension \"%s\" with " - " version (%s -> %s).\n" VTRESET, - candidateName->getCStringNoCopy(), - candidateVersionString->getCStringNoCopy(), - incumbentVersionString->getCStringNoCopy()); - winner = incumbent; - goto finish; - } else if (0 == strcmp(candidateVersionString->getCStringNoCopy(), "51.0.0")) { - IOLog(VTYELLOW "Skipping duplicate extension \"%s\" with " - " version (%s -> %s).\n" VTRESET, - candidateName->getCStringNoCopy(), - incumbentVersionString->getCStringNoCopy(), - candidateVersionString->getCStringNoCopy()); - winner = candidate; - goto finish; - } - } - if (candidate_vers > incumbent_vers) { IOLog(VTYELLOW "Replacing extension \"%s\" with newer version " "(%s -> %s).\n" VTRESET, diff --git a/libsyscall/BSDmakefile b/libsyscall/BSDmakefile index 699706fb6..8a6ff2a3b 100644 --- a/libsyscall/BSDmakefile +++ b/libsyscall/BSDmakefile @@ -16,7 +16,7 @@ RC_ARCHS = $(ARCH) RC_$(RC_ARCHS) = 1 .endif NARCHS != echo $(RC_ARCHS) | wc -w -LIBSYS = $(NEXT_ROOT)/usr/local/lib/system +LIBSYS = $(SDKROOT)/usr/local/lib/system NJOBS != perl -e '$$n = `/usr/sbin/sysctl -n hw.ncpu`; printf "%d\n", $$n < 2 ? 
2 : ($$n * 1.5)' BSDMAKE = bsdmake -f Makefile BSDMAKEJ = $(BSDMAKE) -j $(NJOBS) diff --git a/libsyscall/Makefile b/libsyscall/Makefile index ab642795b..a40b4fb5e 100644 --- a/libsyscall/Makefile +++ b/libsyscall/Makefile @@ -22,12 +22,16 @@ CC = gcc .ifdef ALTFRAMEWORKSPATH PRIVINC = -F${ALTFRAMEWORKSPATH} -I${ALTFRAMEWORKSPATH}/System.framework/PrivateHeaders .else -PRIVINC = -I${NEXT_ROOT}/System/Library/Frameworks/System.framework/PrivateHeaders +PRIVINC = -I${SDKROOT}/System/Library/Frameworks/System.framework/PrivateHeaders .endif CFLAGS += ${PRIVINC} -CFLAGS += -no-cpp-precomp -force_cpusubtype_ALL +.if empty $(MACHINE_ARCH:Marm*) +CFLAGS += -force_cpusubtype_ALL +AINC= -force_cpusubtype_ALL +.endif +CFLAGS += -no-cpp-precomp CFLAGS += -fno-common -pipe -Wmost -g -AINC= -no-cpp-precomp -force_cpusubtype_ALL +AINC+= -no-cpp-precomp AINC+= -arch ${MACHINE_ARCH} -g CLEANFILES+=tags INSTALL_PIC_ARCHIVE= yes @@ -43,7 +47,7 @@ MAKEOBJDIR ?= ${OBJROOT} # add version string SRCS += libsyscall_version.c libsyscall_version.c: - ${NEXT_ROOT}/Developer/Makefiles/bin/version.pl Libsyscall > $@ + ${SDKROOT}/Developer/Makefiles/bin/version.pl Libsyscall > $@ CFLAGS += -I${SYMROOT} .include "${.CURDIR}/Makefile.inc" diff --git a/libsyscall/Makefile.xbs b/libsyscall/Makefile.xbs index 4b3d8d543..8f6973e6c 100644 --- a/libsyscall/Makefile.xbs +++ b/libsyscall/Makefile.xbs @@ -94,11 +94,7 @@ PRIVHDRSPPC = ${PRIVHDRS}/architecture/ppc KERNELFRAMEWORK = ${DESTDIR}/System/Library/Frameworks/Kernel.framework PRIVKERNELHDRS = ${KERNELFRAMEWORK}/Versions/A/PrivateHeaders -.if ${MACHINE_ARCH} == armv6 -ARCHDIR = arm -.else -ARCHDIR = ${MACHINE_ARCH} -.endif +ARCHDIR = ${MACHINE_ARCH:C/^armv.*$/arm/} installhdrs-md: gen_md_mig_defs mkdir -p ${INCDIR}/mach/${ARCHDIR} diff --git a/libsyscall/mach/Makefile.inc b/libsyscall/mach/Makefile.inc index 40048e71e..516300d2a 100644 --- a/libsyscall/mach/Makefile.inc +++ b/libsyscall/mach/Makefile.inc @@ -1,9 +1,5 @@ # machine-dependent mach sources -.if ${MACHINE_ARCH} == armv6 -ARCHDIR = arm -.else -ARCHDIR = ${MACHINE_ARCH} -.endif +ARCHDIR = ${MACHINE_ARCH:C/^armv.*$/arm/} .if exists(${.CURDIR}/mach/${ARCHDIR}/Makefile.inc) .include "${.CURDIR}/mach/${ARCHDIR}/Makefile.inc" .endif diff --git a/makedefs/MakeInc.def b/makedefs/MakeInc.def index fab9fa524..c4b6c21dc 100644 --- a/makedefs/MakeInc.def +++ b/makedefs/MakeInc.def @@ -197,6 +197,10 @@ ARCH_FLAGS_PPC = -arch ppc ARCH_FLAGS_I386 = -arch i386 ARCH_FLAGS_ARM = $($(addsuffix $(MACHINE_CONFIG),ARCH_FLAGS_ARM_)) +ARCH_FLAGS_ALL_PPC = $(ARCH_FLAGS_PPC) +ARCH_FLAGS_ALL_I386 = $(ARCH_FLAGS_I386) +ARCH_FLAGS_ALL_ARM = -arch arm + # # Default CFLAGS @@ -215,35 +219,36 @@ export CFLAGS_GEN = -static $(DEBUG_CFLAGS) -nostdinc -nostdlib \ -fno-builtin -finline -msoft-float \ -fsigned-bitfields $(OTHER_CFLAGS) +ifeq ($(BUILD_STABS),1) +export CFLAGS_GEN += -gstabs+ +export BUILD_DWARF = 0 +export BUILD_STABS = 1 +else +export CFLAGS_GEN += -gdwarf-2 +export BUILD_DWARF = 1 +export BUILD_STABS = 0 +endif + export CFLAGS_RELEASE = export CFLAGS_DEVELOPMENT = export CFLAGS_DEBUG = export CFLAGS_PROFILE = -pg -ifeq ($(BUILD_STABS),1) -export CFLAGS_PPC = -Dppc -DPPC -D__PPC__ -DPAGE_SIZE_FIXED \ - -mno-altivec -gstabs+ -force_cpusubtype_ALL -export CFLAGS_I386 = -Di386 -DI386 -D__I386__ \ - -DPAGE_SIZE_FIXED -gstabs+ -force_cpusubtype_ALL -export CFLAGS_ARM = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \ - -fno-strict-aliasing -gstabs+ -fno-keep-inline-functions -export BUILD_DWARF = 0 -export BUILD_STABS = 1 -else export 
CFLAGS_PPC = -Dppc -DPPC -D__PPC__ -DPAGE_SIZE_FIXED \ - -mno-altivec -gdwarf-2 -force_cpusubtype_ALL + -mno-altivec -force_cpusubtype_ALL export CFLAGS_I386 = -Di386 -DI386 -D__I386__ \ - -DPAGE_SIZE_FIXED -gdwarf-2 -force_cpusubtype_ALL + -DPAGE_SIZE_FIXED -force_cpusubtype_ALL export CFLAGS_ARM = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \ - -fno-strict-aliasing -gdwarf-2 -fno-keep-inline-functions -export BUILD_DWARF = 1 -export BUILD_STABS = 0 + -fno-strict-aliasing -fno-keep-inline-functions + +ifeq (-arch armv7,$(ARCH_FLAGS_ARM)) +CFLAGS_ARM += -mthumb endif ifeq (-arch armv6,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb endif ifeq (-arch armv5,$(ARCH_FLAGS_ARM)) -CFLAGS_ARM += -mthumb +#CFLAGS_ARM += -mthumb # endif ifeq (-arch xscale,$(ARCH_FLAGS_ARM)) CFLAGS_ARM += -mthumb @@ -327,7 +332,7 @@ export LDFLAGS_COMPONENT_PROFILE = $(COMP_LDFLAGS_COMPONENT_PROFILE) export LDFLAGS_COMPONENT_PPC = $(COMP_LDFLAGS_COMPONENT_PPC) -force_cpusubtype_ALL export LDFLAGS_COMPONENT_I386 = $(COMP_LDFLAGS_COMPONENT_i386) -export LDFLAGS_COMPONENT_ARM = $(COMP_LDFLAGS_COMPONENT_ARM) +export LDFLAGS_COMPONENT_ARM = $(COMP_LDFLAGS_COMPONENT_ARM) -Wl,-new_linker export LDFLAGS_COMPONENT = $(LDFLAGS_COMPONENT_GEN) \ $($(addsuffix $(ARCH_CONFIG),ARCH_FLAGS_)) \ @@ -364,6 +369,7 @@ export LDFLAGS_KERNEL_I386 = \ -Wl,-segaddr,__TEXT,0x111000 export LDFLAGS_KERNEL_ARM = \ + -Wl,-new_linker \ -Wl,-segaddr,__HIB,0xC0000000 \ -Wl,-segaddr,__TEXT,0xC0008000 diff --git a/makedefs/MakeInc.rule b/makedefs/MakeInc.rule index 9e62069ae..c2f11dbb5 100644 --- a/makedefs/MakeInc.rule +++ b/makedefs/MakeInc.rule @@ -625,7 +625,7 @@ $(OBJPATH)/kgmacros: $(SRCROOT)/kgmacros $(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel force_file_install @echo Installing $< in $@; - @if [ ! -e $(DSTROOT)$(INSTALL_FILE_DIR) ]; then \ + $(_v)if [ ! -e $(DSTROOT)$(INSTALL_FILE_DIR) ]; then \ $(MKDIR) $(DSTROOT)$(INSTALL_FILE_DIR); \ fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ @@ -636,14 +636,38 @@ $(DSTROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TA echo >empty_file_$(notdir $@); \ lipo_arg="$(subst _empty_file, empty_file_$(notdir $@),$(foreach lipo_arch,$(INSTALL_ARCHS_LC), $(addprefix -arch , $(addsuffix _empty_file, $(lipo_arch)))))"; \ $(LIPO) $${lipo_arg} -create -output $@; \ - $(RM) $(RMFLAGS) empty_file_$(notdir $@); \ + $(RM) $(RMFLAGS) empty_file_$(notdir $@); \ fi; \ $(LIPO) $@ -replace $(ARCH_CONFIG_LC) $< -o $@; \ + fi; \ + if [ $(BUILD_DWARF) -eq 1 ]; then \ + if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ + $(CP) -f $< $<.ctfsys; \ + $(FIND) $(OBJPATH)/ -name \*.ctf -size 0 \ + -exec $(RM) -rf {} \; ; \ + $(CTFMERGE) -l xnu -o $<.ctfsys \ + $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \ + $(INSTALL) $(FILE_INSTALL_FLAGS) $<.ctfsys $@.ctfsys; \ + else \ + if [ ! 
-e $@.ctfsys ]; then \ + echo >empty_file_$(notdir $@); \ + lipo_arg="$(subst _empty_file, empty_file_$(notdir $@),$(foreach lipo_arch,$(INSTALL_ARCHS_LC), $(addprefix -arch , $(addsuffix _empty_file, $(lipo_arch)))))"; \ + $(LIPO) $${lipo_arg} -create -output $@.ctfsys;\ + $(RM) $(RMFLAGS) empty_file_$(notdir $@);\ + fi; \ + $(FIND) $(OBJPATH)/ -name \*.ctf -size 0 \ + -exec $(RM) -rf {} \; ; \ + $(CP) -f $< $<.ctfsys; \ + $(CTFMERGE) -l xnu -o $<.ctfsys \ + $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \ + $(LIPO) $@.ctfsys -replace $(ARCH_CONFIG_LC) \ + $<.ctfsys -o $@.ctfsys; \ + fi; \ fi $(SYMROOT)$(INSTALL_FILE_DIR)mach.$(KERNEL_CONFIG_LC).$(MACHINE_CONFIG_LC): $(TARGET)/mach_kernel.sys force_file_install @echo Installing $< in $@; - @if [ ! -e $(SYMROOT)$(INSTALL_FILE_DIR) ]; then \ + $(_v)if [ ! -e $(SYMROOT)$(INSTALL_FILE_DIR) ]; then \ $(MKDIR) $(SYMROOT)$(INSTALL_FILE_DIR); \ fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ @@ -682,7 +706,9 @@ $(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% forc fi; \ if [ "`echo $(INSTALL_ARCHS_LC) | wc -w`" -eq 1 ]; then \ $(RM) $(RMFLAGS) $@; \ - $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@; \ + if [ $(MACHINE_CONFIG) = DEFAULT ]; then \ + $(INSTALL) $(FILE_INSTALL_FLAGS) $< $@; \ + fi; \ else \ if [ ! -e $@ ]; then \ echo >empty_file_$(notdir $@); \ @@ -699,7 +725,9 @@ $(INSTALL_FILE_FILES_GENERIC): $(DSTROOT)$(INSTALL_FILE_DIR)% : $(TARGET)/% forc -exec $(RM) -rf {} \; ; \ $(CTFMERGE) -l xnu -o $<.ctfsys \ $(OBJPATH)/*/$(KERNEL_CONFIG)/*.*o.ctf || true; \ - $(INSTALL) $(FILE_INSTALL_FLAGS) $<.ctfsys $(dir $@); \ + if [ $(MACHINE_CONFIG) = DEFAULT ]; then \ + $(INSTALL) $(FILE_INSTALL_FLAGS) $<.ctfsys $(dir $@); \ + fi; \ else \ if [ ! -e $@.ctfsys ]; then \ echo >empty_file_$(notdir $@); \ diff --git a/osfmk/conf/files.i386 b/osfmk/conf/files.i386 index a41da57da..2864ea6b9 100644 --- a/osfmk/conf/files.i386 +++ b/osfmk/conf/files.i386 @@ -85,9 +85,13 @@ osfmk/i386/commpage/bcopy_scalar.s standard osfmk/i386/commpage/bcopy_sse2.s standard osfmk/i386/commpage/bcopy_sse3x.s standard osfmk/i386/commpage/bcopy_sse3x_64.s standard +osfmk/i386/commpage/bcopy_sse42.s standard +osfmk/i386/commpage/bcopy_sse42_64.s standard osfmk/i386/commpage/bzero_scalar.s standard osfmk/i386/commpage/bzero_sse2.s standard osfmk/i386/commpage/bzero_sse2_64.s standard +osfmk/i386/commpage/bzero_sse42.s standard +osfmk/i386/commpage/bzero_sse42_64.s standard osfmk/i386/commpage/memset_pattern_sse2.s standard osfmk/i386/commpage/memset_pattern_sse2_64.s standard osfmk/i386/commpage/longcopy_sse3x.s standard diff --git a/osfmk/i386/AT386/model_dep.c b/osfmk/i386/AT386/model_dep.c index 73ea3948a..b0b961b13 100644 --- a/osfmk/i386/AT386/model_dep.c +++ b/osfmk/i386/AT386/model_dep.c @@ -142,6 +142,8 @@ typedef struct _cframe_t { static unsigned panic_io_port; static unsigned commit_paniclog_to_nvram; +int debug_boot_arg; + void machine_startup(void) { @@ -157,7 +159,8 @@ machine_startup(void) if (boot_arg & DB_PRT) disable_debug_output=FALSE; if (boot_arg & DB_SLOG) systemLogDiags=TRUE; if (boot_arg & DB_NMI) panicDebugging=TRUE; - if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; + if (boot_arg & DB_LOG_PI_SCRN) logPanicDataToScreen=TRUE; + debug_boot_arg = boot_arg; } if (!PE_parse_boot_argn("nvram_paniclog", &commit_paniclog_to_nvram, sizeof (commit_paniclog_to_nvram))) @@ -1052,6 +1055,9 @@ out: kmod_dump(&PC, 1); panic_display_system_configuration(); + panic_display_zprint(); + dump_kext_info(&kdb_log); + /* Release 
print backtrace lock, to permit other callers in the * event of panics on multiple processors. */ diff --git a/osfmk/i386/commpage/bcopy_sse3x.s b/osfmk/i386/commpage/bcopy_sse3x.s index 418635d9b..8e42ba042 100644 --- a/osfmk/i386/commpage/bcopy_sse3x.s +++ b/osfmk/i386/commpage/bcopy_sse3x.s @@ -802,4 +802,4 @@ LReverseUnalignedLoop: // loop over 64-byte chunks jmp LReverseShort // copy remaining 0..63 bytes and done - COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,0) + COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bcopy_sse3x_64.s b/osfmk/i386/commpage/bcopy_sse3x_64.s index eae8b0ce8..53f4ed76a 100644 --- a/osfmk/i386/commpage/bcopy_sse3x_64.s +++ b/osfmk/i386/commpage/bcopy_sse3x_64.s @@ -146,11 +146,11 @@ LNotShort: // rdi = ptr to 1st dest byte not to move (aligned) LDestAligned: - movl %edx,%ecx // copy length + movq %rdx,%rcx // copy length movl %esi,%eax // copy low half of source address andl $63,%edx // get remaining bytes for LShort andl $15,%eax // mask to low 4 bits of source address - andl $-64,%ecx // get number of bytes we will copy in inner loop + andq $-64,%rcx // get number of bytes we will copy in inner loop // We'd like to use lea with rip-relative addressing, but cannot in a .code64 block. // lea LTable(%rip),%r8 // point to dispatch table movq $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8 // work around 4586528 @@ -794,4 +794,4 @@ LReverseUnalignedLoop: // loop over 64-byte chunks jmp LReverseShort // copy remaining 0..63 bytes and done - COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,0) + COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bcopy_sse42.s b/osfmk/i386/commpage/bcopy_sse42.s new file mode 100644 index 000000000..9ddd281ef --- /dev/null +++ b/osfmk/i386/commpage/bcopy_sse42.s @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +/* + * The bcopy/memcpy loops, tuned for Nehalem. 
+ * + * The following #defines are tightly coupled to the u-architecture: + */ + +#define kShort 80 // too short to bother with SSE (must be >=80) + + +// void bcopy(const void *src, void *dst, size_t len); + + .text + .align 5, 0x90 +Lbcopy_sse42: // void bcopy(const void *src, void *dst, size_t len) + pushl %ebp // set up a frame for backtraces + movl %esp,%ebp + pushl %esi + pushl %edi + movl 8(%ebp),%esi // get source ptr + movl 12(%ebp),%edi // get dest ptr + movl 16(%ebp),%ecx // get length + movl %edi,%edx + subl %esi,%edx // (dest - source) + cmpl %ecx,%edx // must move in reverse if (dest - source) < length + jb LReverseIsland + cmpl $(kShort),%ecx // long enough to bother with SSE? + jbe Lshort // no + jmp LNotShort + +// +// void *memcpy(void *dst, const void *src, size_t len); +// void *memmove(void *dst, const void *src, size_t len); +// +// NB: These need to be 32 bytes from bcopy(): +// + + .align 5, 0x90 +Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len) +Lmemmove: // void *memmove(void *dst, const void *src, size_t len) + pushl %ebp // set up a frame for backtraces + movl %esp,%ebp + pushl %esi + pushl %edi + movl 8(%ebp),%edi // get dest ptr + movl 12(%ebp),%esi // get source ptr + movl 16(%ebp),%ecx // get length + movl %edi,%edx + subl %esi,%edx // (dest - source) + cmpl %ecx,%edx // must move in reverse if (dest - source) < length + jb LReverseIsland + cmpl $(kShort),%ecx // long enough to bother with SSE? + ja LNotShort // yes + +// Handle short forward copies. As the most common case, this is the fall-through path. +// ecx = length (<= kShort) +// esi = source ptr +// edi = dest ptr + +Lshort: + movl %ecx,%edx // copy length + shrl $2,%ecx // get #doublewords + jz 3f +2: // loop copying doublewords + movl (%esi),%eax + addl $4,%esi + movl %eax,(%edi) + addl $4,%edi + dec %ecx + jnz 2b +3: // handle leftover bytes (0..3) in last word + andl $3,%edx // any leftover bytes? + jz Lexit +4: // loop copying bytes + movb (%esi),%al + inc %esi + movb %al,(%edi) + inc %edi + dec %edx + jnz 4b +Lexit: + movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove + popl %edi + popl %esi + popl %ebp + ret + + +LReverseIsland: // keep the "jb" above a short branch... + jmp LReverse // ...because reverse moves are uncommon + + +// Handle forward moves that are long enough to justify use of SSE. +// First, 16-byte align the destination. +// ecx = length (> kShort) +// esi = source ptr +// edi = dest ptr + +LNotShort: + movl %edi,%edx // copy destination + negl %edx + andl $15,%edx // get #bytes to align destination + jz LDestAligned // already aligned + subl %edx,%ecx // decrement length +1: // loop copying 1..15 bytes + movb (%esi),%al + inc %esi + movb %al,(%edi) + inc %edi + dec %edx + jnz 1b + +// Destination is now aligned. Nehalem does a great job with unaligned SSE loads, +// so we use MOVDQU rather than aligned loads and shifts. Since kShort>=80, we +// know there is at least one 64-byte chunk to move. 
+// When we enter the copy loops, the following registers are set up: +// ecx = residual length (0..63) +// edx = -(length to move), a multiple of 64 +// esi = ptr to 1st source byte not to move (unaligned) +// edi = ptr to 1st dest byte not to move (aligned) + +LDestAligned: + movl %ecx,%edx // copy length + andl $63,%ecx // get remaining bytes for Lshort + andl $-64,%edx // get number of bytes we will copy in inner loop + addl %edx,%esi // point to 1st byte not copied + addl %edx,%edi + negl %edx // now generate offset to 1st byte to be copied + testl $15,%esi // source also aligned? + jnz LUnalignedLoop + jmp LAlignedLoop + + +// Forward loop for aligned operands. + + .align 4,0x90 // 16-byte align inner loops +LAlignedLoop: // loop over 64-byte chunks + movdqa (%esi,%edx),%xmm0 + movdqa 16(%esi,%edx),%xmm1 + movdqa 32(%esi,%edx),%xmm2 + movdqa 48(%esi,%edx),%xmm3 + + movdqa %xmm0,(%edi,%edx) + movdqa %xmm1,16(%edi,%edx) + movdqa %xmm2,32(%edi,%edx) + movdqa %xmm3,48(%edi,%edx) + + addl $64,%edx + jnz LAlignedLoop + + jmp Lshort // copy remaining 0..63 bytes and done + + +// Forward loop for unaligned operands. + + .align 4,0x90 // 16-byte align inner loops +LUnalignedLoop: // loop over 64-byte chunks + movdqu (%esi,%edx),%xmm0 + movdqu 16(%esi,%edx),%xmm1 + movdqu 32(%esi,%edx),%xmm2 + movdqu 48(%esi,%edx),%xmm3 + + movdqa %xmm0,(%edi,%edx) + movdqa %xmm1,16(%edi,%edx) + movdqa %xmm2,32(%edi,%edx) + movdqa %xmm3,48(%edi,%edx) + + addl $64,%edx + jnz LUnalignedLoop + + jmp Lshort // copy remaining 0..63 bytes and done + + +// Reverse moves. They are only used with destructive overlap. +// ecx = length +// esi = source ptr +// edi = dest ptr + +LReverse: + addl %ecx,%esi // point to end of strings + addl %ecx,%edi + cmpl $(kShort),%ecx // long enough to bother with SSE? + ja LReverseNotShort // yes + +// Handle reverse short copies. +// ecx = length +// esi = one byte past end of source +// edi = one byte past end of dest + +LReverseShort: + movl %ecx,%edx // copy length + shrl $2,%ecx // #words + jz 3f +1: + subl $4,%esi + movl (%esi),%eax + subl $4,%edi + movl %eax,(%edi) + dec %ecx + jnz 1b +3: + andl $3,%edx // bytes? + jz 5f +4: + dec %esi + movb (%esi),%al + dec %edi + movb %al,(%edi) + dec %edx + jnz 4b +5: + movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove + popl %edi + popl %esi + popl %ebp + ret + +// Handle a reverse move long enough to justify using SSE. +// ecx = length +// esi = one byte past end of source +// edi = one byte past end of dest + +LReverseNotShort: + movl %edi,%edx // copy destination + andl $15,%edx // get #bytes to align destination + je LReverseDestAligned // already aligned + subl %edx,%ecx // adjust length +1: // loop copying 1..15 bytes + dec %esi + movb (%esi),%al + dec %edi + movb %al,(%edi) + dec %edx + jnz 1b + +// Destination is now aligned. Prepare for reverse loops. + +LReverseDestAligned: + movl %ecx,%edx // copy length + andl $63,%ecx // get remaining bytes for Lshort + andl $-64,%edx // get number of bytes we will copy in inner loop + subl %edx,%esi // point to endpoint of copy + subl %edx,%edi + testl $15,%esi // is source aligned too? 
+ jnz LReverseUnalignedLoop // no + +LReverseAlignedLoop: // loop over 64-byte chunks + movdqa -16(%esi,%edx),%xmm0 + movdqa -32(%esi,%edx),%xmm1 + movdqa -48(%esi,%edx),%xmm2 + movdqa -64(%esi,%edx),%xmm3 + + movdqa %xmm0,-16(%edi,%edx) + movdqa %xmm1,-32(%edi,%edx) + movdqa %xmm2,-48(%edi,%edx) + movdqa %xmm3,-64(%edi,%edx) + + subl $64,%edx + jne LReverseAlignedLoop + + jmp LReverseShort // copy remaining 0..63 bytes and done + + +// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. + +LReverseUnalignedLoop: // loop over 64-byte chunks + movdqu -16(%esi,%edx),%xmm0 + movdqu -32(%esi,%edx),%xmm1 + movdqu -48(%esi,%edx),%xmm2 + movdqu -64(%esi,%edx),%xmm3 + + movdqa %xmm0,-16(%edi,%edx) + movdqa %xmm1,-32(%edi,%edx) + movdqa %xmm2,-48(%edi,%edx) + movdqa %xmm3,-64(%edi,%edx) + + subl $64,%edx + jne LReverseUnalignedLoop + + jmp LReverseShort // copy remaining 0..63 bytes and done + + + COMMPAGE_DESCRIPTOR(bcopy_sse42,_COMM_PAGE_BCOPY,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/bcopy_sse42_64.s b/osfmk/i386/commpage/bcopy_sse42_64.s new file mode 100644 index 000000000..7de012622 --- /dev/null +++ b/osfmk/i386/commpage/bcopy_sse42_64.s @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +/* + * The bcopy/memcpy loops, tuned for Nehalem. This is the 64-bit version. + * + * The following #defines are tightly coupled to the u-architecture: + */ + +#define kShort 80 // too short to bother with SSE (must be >=80) + + +// void bcopy(const void *src, void *dst, size_t len); + + .text + .code64 + .align 5, 0x90 +Lbcopy_sse42_64: // void bcopy(const void *src, void *dst, size_t len) + pushq %rbp // set up a frame for backtraces + movq %rsp,%rbp + movq %rsi,%rax // copy dest ptr + movq %rdi,%rsi // xchange source and dest ptrs + movq %rax,%rdi + subq %rsi,%rax // (dest - source) + cmpq %rdx,%rax // must move in reverse if (dest - source) < length + jb LReverseIsland + cmpq $(kShort),%rdx // long enough to bother with SSE? 
+ jbe LShort // no + jmp LNotShort + +// +// void *memcpy(void *dst, const void *src, size_t len); +// void *memmove(void *dst, const void *src, size_t len); +// +// NB: These need to be 32 bytes from bcopy(): +// + + .align 5, 0x90 +Lmemcpy: // void *memcpy(void *dst, const void *src, size_t len) +Lmemmove: // void *memmove(void *dst, const void *src, size_t len) + pushq %rbp // set up a frame for backtraces + movq %rsp,%rbp + movq %rdi,%r11 // save return value here + movq %rdi,%rax + subq %rsi,%rax // (dest - source) + cmpq %rdx,%rax // must move in reverse if (dest - source) < length + jb LReverseIsland + cmpq $(kShort),%rdx // long enough to bother with SSE? + ja LNotShort // yes + +// Handle short forward copies. As the most common case, this is the fall-through path. +// rdx = length (<= kShort) +// rsi = source ptr +// rdi = dest ptr + +LShort: + movl %edx,%ecx // copy length using 32-bit operation + shrl $2,%ecx // get #doublewords + jz 3f +2: // loop copying doublewords + movl (%rsi),%eax + addq $4,%rsi + movl %eax,(%rdi) + addq $4,%rdi + decl %ecx + jnz 2b +3: // handle leftover bytes (0..3) in last word + andl $3,%edx // any leftover bytes? + jz 5f +4: // loop copying bytes + movb (%rsi),%al + incq %rsi + movb %al,(%rdi) + incq %rdi + decl %edx + jnz 4b +5: + movq %r11,%rax // get return value (dst ptr) for memcpy/memmove + popq %rbp + ret + + +LReverseIsland: // keep the "jb" above a short branch... + jmp LReverse // ...because reverse moves are uncommon + + +// Handle forward moves that are long enough to justify use of SSE. +// First, 16-byte align the destination. +// rdx = length (> kShort) +// rsi = source ptr +// rdi = dest ptr + +LNotShort: + movl %edi,%ecx // copy low half of destination ptr + negl %ecx + andl $15,%ecx // get #bytes to align destination + jz LDestAligned // already aligned + subl %ecx,%edx // decrement length +1: // loop copying 1..15 bytes + movb (%rsi),%al + inc %rsi + movb %al,(%rdi) + inc %rdi + dec %ecx + jnz 1b + + +// Destination is now aligned. Nehalem does a great job with unaligned SSE loads, +// so we use MOVDQU rather than aligned loads and shifts. Since kShort>=80, we +// know there is at least one 64-byte chunk to move. +// When we enter the copy loops, the following registers are set up: +// rdx = residual length (0..63) +// rcx = -(length to move), a multiple of 64 less than 2GB +// rsi = ptr to 1st source byte not to move (unaligned) +// rdi = ptr to 1st dest byte not to move (aligned) + +LDestAligned: + movq %rdx,%rcx // copy length + andl $63,%edx // get remaining bytes for LShort + andq $-64,%rcx // get number of bytes we will copy in inner loop + addq %rcx,%rsi // point to 1st byte not copied + addq %rcx,%rdi + negq %rcx // now generate offset to 1st byte to be copied + testl $15,%esi // source also aligned? + jnz LUnalignedLoop + jmp LAlignedLoop + + +// Forward loop for aligned operands. + + .align 4,0x90 // 16-byte align inner loops +LAlignedLoop: // loop over 64-byte chunks + movdqa (%rsi,%rcx),%xmm0 + movdqa 16(%rsi,%rcx),%xmm1 + movdqa 32(%rsi,%rcx),%xmm2 + movdqa 48(%rsi,%rcx),%xmm3 + + movdqa %xmm0,(%rdi,%rcx) + movdqa %xmm1,16(%rdi,%rcx) + movdqa %xmm2,32(%rdi,%rcx) + movdqa %xmm3,48(%rdi,%rcx) + + addq $64,%rcx + jnz LAlignedLoop + + jmp LShort // copy remaining 0..63 bytes and done + + +// Forward loop for unaligned operands. 
+ + .align 4,0x90 // 16-byte align inner loops +LUnalignedLoop: // loop over 64-byte chunks + movdqu (%rsi,%rcx),%xmm0 + movdqu 16(%rsi,%rcx),%xmm1 + movdqu 32(%rsi,%rcx),%xmm2 + movdqu 48(%rsi,%rcx),%xmm3 + + movdqa %xmm0,(%rdi,%rcx) + movdqa %xmm1,16(%rdi,%rcx) + movdqa %xmm2,32(%rdi,%rcx) + movdqa %xmm3,48(%rdi,%rcx) + + addq $64,%rcx + jnz LUnalignedLoop + + jmp LShort // copy remaining 0..63 bytes and done + + +// Reverse moves. These are only used with destructive overlap. +// rdx = length +// rsi = source ptr +// rdi = dest ptr + +LReverse: + addq %rdx,%rsi // point to end of strings + addq %rdx,%rdi + cmpq $(kShort),%rdx // long enough to bother with SSE? + ja LReverseNotShort // yes + +// Handle reverse short copies. +// edx = length (<= kShort) +// rsi = one byte past end of source +// rdi = one byte past end of dest + +LReverseShort: + movl %edx,%ecx // copy length + shrl $3,%ecx // #quadwords + jz 3f +1: + subq $8,%rsi + movq (%rsi),%rax + subq $8,%rdi + movq %rax,(%rdi) + decl %ecx + jnz 1b +3: + andl $7,%edx // bytes? + jz 5f +4: + decq %rsi + movb (%rsi),%al + decq %rdi + movb %al,(%rdi) + decl %edx + jnz 4b +5: + movq %r11,%rax // get return value (dst ptr) for memcpy/memmove + popq %rbp + ret + +// Handle a reverse move long enough to justify using SSE. +// rdx = length (> kShort) +// rsi = one byte past end of source +// rdi = one byte past end of dest + +LReverseNotShort: + movl %edi,%ecx // copy destination + andl $15,%ecx // get #bytes to align destination + jz LReverseDestAligned // already aligned + subq %rcx,%rdx // adjust length +1: // loop copying 1..15 bytes + decq %rsi + movb (%rsi),%al + decq %rdi + movb %al,(%rdi) + decl %ecx + jnz 1b + +// Destination is now aligned. Prepare for reverse loops. + +LReverseDestAligned: + movq %rdx,%rcx // copy length + andl $63,%edx // get remaining bytes for LReverseShort + andq $-64,%rcx // get number of bytes we will copy in inner loop + subq %rcx,%rsi // point to endpoint of copy + subq %rcx,%rdi + testl $15,%esi // is source aligned too? + jnz LReverseUnalignedLoop // no + +LReverseAlignedLoop: // loop over 64-byte chunks + movdqa -16(%rsi,%rcx),%xmm0 + movdqa -32(%rsi,%rcx),%xmm1 + movdqa -48(%rsi,%rcx),%xmm2 + movdqa -64(%rsi,%rcx),%xmm3 + + movdqa %xmm0,-16(%rdi,%rcx) + movdqa %xmm1,-32(%rdi,%rcx) + movdqa %xmm2,-48(%rdi,%rcx) + movdqa %xmm3,-64(%rdi,%rcx) + + subq $64,%rcx + jne LReverseAlignedLoop + + jmp LReverseShort // copy remaining 0..63 bytes and done + + +// Reverse, unaligned loop. LDDQU==MOVDQU on these machines. 
+ +LReverseUnalignedLoop: // loop over 64-byte chunks + movdqu -16(%rsi,%rcx),%xmm0 + movdqu -32(%rsi,%rcx),%xmm1 + movdqu -48(%rsi,%rcx),%xmm2 + movdqu -64(%rsi,%rcx),%xmm3 + + movdqa %xmm0,-16(%rdi,%rcx) + movdqa %xmm1,-32(%rdi,%rcx) + movdqa %xmm2,-48(%rdi,%rcx) + movdqa %xmm3,-64(%rdi,%rcx) + + subq $64,%rcx + jne LReverseUnalignedLoop + + jmp LReverseShort // copy remaining 0..63 bytes and done + + + COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/bzero_sse2.s b/osfmk/i386/commpage/bzero_sse2.s index 49c94750d..a80418bd9 100644 --- a/osfmk/i386/commpage/bzero_sse2.s +++ b/osfmk/i386/commpage/bzero_sse2.s @@ -161,4 +161,4 @@ LVeryLong: jmp Lshort - COMMPAGE_DESCRIPTOR(bzero_sse2,_COMM_PAGE_BZERO,kHasSSE2,0) + COMMPAGE_DESCRIPTOR(bzero_sse2,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bzero_sse2_64.s b/osfmk/i386/commpage/bzero_sse2_64.s index d82d77e6f..ef494cbbf 100644 --- a/osfmk/i386/commpage/bzero_sse2_64.s +++ b/osfmk/i386/commpage/bzero_sse2_64.s @@ -161,4 +161,4 @@ LVeryLong: jmp Lshort - COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,0) + COMMPAGE_DESCRIPTOR(bzero_sse2_64,_COMM_PAGE_BZERO,kHasSSE2,kHasSSE4_2) diff --git a/osfmk/i386/commpage/bzero_sse42.s b/osfmk/i386/commpage/bzero_sse42.s new file mode 100644 index 000000000..8db6b07a9 --- /dev/null +++ b/osfmk/i386/commpage/bzero_sse42.s @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +/* + * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, ie Nehalem. + * We don't actually use SSE4.2, but rather use it to identify Nehalem. + * + * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS. + * + * This routine is also used for memset(p,0,n), which is a common case + * since gcc sometimes silently maps bzero() into memset(). As a result, + * we always load the original ptr into %eax before returning. 
+ */ + +#define kShort 80 // too short to bother with SSE (must be >=80) + + + .text + .align 5, 0x90 +Lbzero_sse42: // void bzero(void *b, size_t len); + pushl %ebp // set up a frame for backtraces + movl %esp,%ebp + pushl %edi + movl 8(%ebp),%edi // get ptr + movl 12(%ebp),%edx // get length + + xorl %eax,%eax // set fill data to 0 + cmpl $(kShort),%edx // long enough for SSE? + jg LNotShort // yes + +// Here for short operands or the end of long ones. +// %edx = length +// %edi = ptr +// %eax = zero + +Lshort: + cmpl $12,%edx // long enough to word align? + jge 3f // yes + test %edx,%edx // length==0? + jz 6f +1: + movb %al,(%edi) // zero a byte + inc %edi + dec %edx + jnz 1b + jmp 6f +2: + movb %al,(%edi) // zero a byte + inc %edi + dec %edx +3: + test $3,%edi // is ptr doubleword aligned? + jnz 2b // no + movl %edx,%ecx // copy length + shrl $2,%edx // #doublewords to store +4: + movl %eax,(%edi) // zero an aligned doubleword + addl $4,%edi + dec %edx + jnz 4b + andl $3,%ecx // mask down to #bytes at end (0..3) + jz 6f // none +5: + movb %al,(%edi) // zero a byte + inc %edi + dec %ecx + jnz 5b +6: + movl 8(%ebp),%eax // get return value in case this was a call of memset() + popl %edi + popl %ebp + ret + + +// We will be using SSE, so align ptr. +// %edx = length +// %edi = ptr +// %eax = zero + +LNotShort: + testl $3,%edi // 4-byte aligned? + jz 2f // yes + movb %al,(%edi) // zero another byte + incl %edi + decl %edx + jmp LNotShort +1: // zero doublewords until 16-byte aligned + movl %eax,(%edi) + addl $4,%edi + subl $4,%edx +2: + testl $15,%edi // 16-byte aligned? + jnz 1b // no + + +// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. +// %edx = length +// %edi = ptr +// %eax = zero + +LDestAligned: + movl %edx,%ecx + andl $63,%edx // mask down to residual length (0..63) + andl $-64,%ecx // get #bytes we will zero in this loop + pxor %xmm0,%xmm0 // zero an SSE register + addl %ecx,%edi // increment ptr by length to move + negl %ecx // negate length to move + jmp 1f + +// Loop over 64-byte chunks, storing into cache. + + .align 4,0x90 // keep inner loops 16-byte aligned +1: + movdqa %xmm0,(%edi,%ecx) + movdqa %xmm0,16(%edi,%ecx) + movdqa %xmm0,32(%edi,%ecx) + movdqa %xmm0,48(%edi,%ecx) + addl $64,%ecx + jne 1b + + jmp Lshort + + + + COMMPAGE_DESCRIPTOR(bzero_sse42,_COMM_PAGE_BZERO,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/bzero_sse42_64.s b/osfmk/i386/commpage/bzero_sse42_64.s new file mode 100644 index 000000000..5f869398c --- /dev/null +++ b/osfmk/i386/commpage/bzero_sse42_64.s @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2008 Apple Computer, Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. 
+ * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include + +/* + * Bzero, tuned for processors with SSE4.2 and 64-byte cache lines, ie Nehalem. + * We don't actually use SSE4.2, but rather use it to identify Nehalem. + * This is the 64-bit version. + * + * We do not use nontemporal operations, but use MOVDQA in preference to REP/STOS. + * + * This routine is also used for memset(p,0,n), which is a common case + * since gcc sometimes silently maps bzero() into memset(). As a result, + * we always load the original ptr into %eax before returning. + */ + +#define kShort 80 // too short to bother with SSE (must be >=80) + + + .text + .code64 + .align 5, 0x90 +Lbzero_sse42_64: // void bzero(void *b, size_t len); + pushq %rbp // set up a frame for backtraces + movq %rsp,%rbp + xorl %eax,%eax // set fill data to 0 + movq %rdi,%r11 // save original ptr as return value + cmpq $(kShort),%rsi // long enough for SSE? + jg LNotShort // yes + +// Here for short operands or the end of long ones. +// %esi = length (<= kShort) +// %rdi = ptr +// %eax = zero + +Lshort: + cmpl $12,%esi // long enough to word align? + jge 3f // yes + test %esi,%esi // length==0? + jz 6f +1: + movb %al,(%rdi) // zero a byte + incq %rdi + decl %esi + jnz 1b + jmp 6f +2: + movb %al,(%rdi) // zero a byte + incq %rdi + decl %esi +3: + testl $3,%edi // is ptr doubleword aligned? + jnz 2b // no + movl %esi,%ecx // copy length + shrl $2,%esi // #doublewords to store +4: + movl %eax,(%rdi) // zero an aligned doubleword + addq $4,%rdi + decl %esi + jnz 4b + andl $3,%ecx // mask down to #bytes at end (0..3) + jz 6f // none +5: + movb %al,(%rdi) // zero a byte + incq %rdi + decl %ecx + jnz 5b +6: + movq %r11,%rax // set return value in case this was a call of memset() + popq %rbp + ret + + +// We will be using SSE, so align ptr. +// %rsi = length (> kShort) +// %rdi = ptr +// %eax = zero + +LNotShort: + testl $3,%edi // 4-byte aligned? + jz 2f // yes + movb %al,(%rdi) // zero another byte + incq %rdi + decq %rsi + jmp LNotShort +1: // zero doublewords until 16-byte aligned + movl %eax,(%rdi) + addq $4,%rdi + subq $4,%rsi +2: + testl $15,%edi // 16-byte aligned? + jnz 1b // no + +// Destination is now 16-byte aligned. Prepare to loop over 64-byte chunks. +// %rsi = length (> (kShort-15)) +// %rdi = ptr (aligned) +// %eax = zero + +LDestAligned: + movq %rsi,%rcx + andl $63,%esi // mask down to residual length (0..63) + andq $-64,%rcx // get #bytes we will zero in this loop + pxor %xmm0,%xmm0 // zero an SSE register + addq %rcx,%rdi // increment ptr by length to move + negq %rcx // negate length to move + jmp 1f + +// Loop over 64-byte chunks, storing into cache. 
+ + .align 4,0x90 // keep inner loops 16-byte aligned +1: + movdqa %xmm0,(%rdi,%rcx) + movdqa %xmm0,16(%rdi,%rcx) + movdqa %xmm0,32(%rdi,%rcx) + movdqa %xmm0,48(%rdi,%rcx) + addq $64,%rcx + jne 1b + + jmp Lshort + + + COMMPAGE_DESCRIPTOR(bzero_sse42_64,_COMM_PAGE_BZERO,kHasSSE4_2,0) diff --git a/osfmk/i386/commpage/commpage_asm.s b/osfmk/i386/commpage/commpage_asm.s index e9604430c..6f69fa7b2 100644 --- a/osfmk/i386/commpage/commpage_asm.s +++ b/osfmk/i386/commpage/commpage_asm.s @@ -95,9 +95,11 @@ _commpage_32_routines: .long CPN(bit_test_and_clear_up) .long CPN(bzero_scalar) .long CPN(bzero_sse2) + .long CPN(bzero_sse42) .long CPN(bcopy_scalar) .long CPN(bcopy_sse2) .long CPN(bcopy_sse3x) + .long CPN(bcopy_sse42) .long CPN(memset_pattern_sse2) .long CPN(longcopy_sse3x) .long CPN(nanotime) @@ -138,7 +140,9 @@ _commpage_64_routines: .long CPN(bit_test_and_clear_mp_64) .long CPN(bit_test_and_clear_up_64) .long CPN(bzero_sse2_64) + .long CPN(bzero_sse42_64) .long CPN(bcopy_sse3x_64) + .long CPN(bcopy_sse42_64) .long CPN(memset_pattern_sse2_64) .long CPN(longcopy_sse3x_64) .long CPN(nanotime_64) diff --git a/osfmk/i386/commpage/commpage_mach_absolute_time.s b/osfmk/i386/commpage/commpage_mach_absolute_time.s index 60baed63c..f10baef8b 100644 --- a/osfmk/i386/commpage/commpage_mach_absolute_time.s +++ b/osfmk/i386/commpage/commpage_mach_absolute_time.s @@ -56,6 +56,7 @@ Lnanotime: testl %esi,%esi /* if being updated, loop until stable */ jz 0b + lfence rdtsc /* get TSC in %edx:%eax */ lfence @@ -99,7 +100,9 @@ Lnanotime_slow: testl %esi,%esi /* if generation is 0, data being changed */ jz 0b /* so loop until stable */ + lfence rdtsc /* get TSC in %edx:%eax */ + lfence subl _COMM_PAGE_NT_TSC_BASE,%eax sbbl _COMM_PAGE_NT_TSC_BASE+4,%edx @@ -161,6 +164,7 @@ Lnanotime_64: // NB: must preserve r9, r10, and r11 movl _NT_GENERATION(%rsi),%r8d // get generation testl %r8d,%r8d // if 0, data is being changed... jz 1b // ...so loop until stable + lfence rdtsc // edx:eax := tsc lfence shlq $32,%rdx // rax := ((edx << 32) | eax), ie 64-bit tsc diff --git a/osfmk/i386/cpu_data.h b/osfmk/i386/cpu_data.h index ae05b1767..e41f6b8cd 100644 --- a/osfmk/i386/cpu_data.h +++ b/osfmk/i386/cpu_data.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -61,9 +61,10 @@ struct mca_state; * Data structures embedded in per-cpu data: */ typedef struct rtclock_timer { - uint64_t deadline; - boolean_t is_set; - boolean_t has_expired; + queue_head_t queue; + uint64_t deadline; + boolean_t is_set; + boolean_t has_expired; } rtclock_timer_t; @@ -130,7 +131,6 @@ typedef struct cpu_data int cpu_subtype; int cpu_threadtype; int cpu_running; - uint64_t rtclock_intr_deadline; rtclock_timer_t rtclock_timer; boolean_t cpu_is64bit; task_map_t cpu_task_map; diff --git a/osfmk/i386/cpu_threads.c b/osfmk/i386/cpu_threads.c index 7727eb7ea..6d539ffb1 100644 --- a/osfmk/i386/cpu_threads.c +++ b/osfmk/i386/cpu_threads.c @@ -43,6 +43,8 @@ void debug_topology_print(void); #define DBG(x...) 
#endif /* TOPO_DEBUG */ +void validate_topology(void); + #define bitmask(h,l) ((bit(h)|(bit(h)-1)) & ~(bit(l)-1)) #define bitfield(x,h,l) (((x) & bitmask(h,l)) >> l) @@ -187,10 +189,6 @@ x86_LLC_info(void) topoParms.nCoresSharingLLC = cpuinfo->core_count; if (nCPUsSharing > cpuinfo->thread_count) topoParms.nLCPUsSharingLLC = cpuinfo->thread_count; - - - if (nCPUsSharing > cpuinfo->thread_count) - topoParms.maxSharingLLC = cpuinfo->thread_count; } static void @@ -1039,6 +1037,180 @@ cpu_thread_halt(void) /* NOT REACHED */ } +/* + * Validates that the topology was built correctly. Must be called only + * after the complete topology is built and no other changes are being made. + */ +void +validate_topology(void) +{ + x86_pkg_t *pkg; + x86_die_t *die; + x86_core_t *core; + x86_lcpu_t *lcpu; + uint32_t nDies; + uint32_t nCores; + uint32_t nCPUs; + + /* + * XXX + * + * Right now this only works if the number of CPUs started is the total + * number of CPUs. However, when specifying cpus=n the topology is only + * partially constructed and the checks below will fail. + * + * We should *always* build the complete topology and only start the CPUs + * indicated by cpus=n. Until that happens, this code will not check the + * topology if the number of cpus defined is < that described by the + * topology parameters. + */ + nCPUs = topoParms.nPackages * topoParms.nLThreadsPerPackage; + if (nCPUs > real_ncpus) + return; + + pkg = x86_pkgs; + while (pkg != NULL) { + /* + * Make sure that the package has the correct number of dies. + */ + nDies = 0; + die = pkg->dies; + while (die != NULL) { + if (die->package == NULL) + panic("Die(%d)->package is NULL", + die->pdie_num); + if (die->package != pkg) + panic("Die %d points to package %d, should be %d", + die->pdie_num, die->package->lpkg_num, pkg->lpkg_num); + + DBG("Die(%d)->package %d\n", + die->pdie_num, pkg->lpkg_num); + + /* + * Make sure that the die has the correct number of cores. + */ + DBG("Die(%d)->cores: "); + nCores = 0; + core = die->cores; + while (core != NULL) { + if (core->die == NULL) + panic("Core(%d)->die is NULL", + core->pcore_num); + if (core->die != die) + panic("Core %d points to die %d, should be %d", + core->pcore_num, core->die->pdie_num, die->pdie_num); + nCores += 1; + DBG("%d ", core->pcore_num); + core = core->next_in_die; + } + DBG("\n"); + + if (nCores != topoParms.nLCoresPerDie) + panic("Should have %d Cores, but only found %d for Die %d", + topoParms.nLCoresPerDie, nCores, die->pdie_num); + + /* + * Make sure that the die has the correct number of CPUs. + */ + DBG("Die(%d)->lcpus: ", die->pdie_num); + nCPUs = 0; + lcpu = die->lcpus; + while (lcpu != NULL) { + if (lcpu->die == NULL) + panic("CPU(%d)->die is NULL", + lcpu->cpu_num); + if (lcpu->die != die) + panic("CPU %d points to die %d, should be %d", + lcpu->cpu_num, lcpu->die->pdie_num, die->pdie_num); + nCPUs += 1; + DBG("%d ", lcpu->cpu_num); + lcpu = lcpu->next_in_die; + } + DBG("\n"); + + if (nCPUs != topoParms.nLThreadsPerDie) + panic("Should have %d Threads, but only found %d for Die %d", + topoParms.nLThreadsPerDie, nCPUs, die->pdie_num); + + nDies += 1; + die = die->next_in_pkg; + } + + if (nDies != topoParms.nLDiesPerPackage) + panic("Should have %d Dies, but only found %d for package %d", + topoParms.nLDiesPerPackage, nDies, pkg->lpkg_num); + + /* + * Make sure that the package has the correct number of cores.
+ */ + nCores = 0; + core = pkg->cores; + while (core != NULL) { + if (core->package == NULL) + panic("Core(%d)->package is NULL", + core->pcore_num); + if (core->package != pkg) + panic("Core %d points to package %d, should be %d", + core->pcore_num, core->package->lpkg_num, pkg->lpkg_num); + DBG("Core(%d)->package %d\n", + core->pcore_num, pkg->lpkg_num); + + /* + * Make sure that the core has the correct number of CPUs. + */ + nCPUs = 0; + lcpu = core->lcpus; + DBG("Core(%d)->lcpus: "); + while (lcpu != NULL) { + if (lcpu->core == NULL) + panic("CPU(%d)->core is NULL", + lcpu->cpu_num); + if (lcpu->core != core) + panic("CPU %d points to core %d, should be %d", + lcpu->cpu_num, lcpu->core->pcore_num, core->pcore_num); + DBG("%d ", lcpu->cpu_num); + nCPUs += 1; + lcpu = lcpu->next_in_core; + } + DBG("\n"); + + if (nCPUs != topoParms.nLThreadsPerCore) + panic("Should have %d Threads, but only found %d for Core %d", + topoParms.nLThreadsPerCore, nCPUs, core->pcore_num); + nCores += 1; + core = core->next_in_pkg; + } + + if (nCores != topoParms.nLCoresPerPackage) + panic("Should have %d Cores, but only found %d for package %d", + topoParms.nLCoresPerPackage, nCores, pkg->lpkg_num); + + /* + * Make sure that the package has the correct number of CPUs. + */ + nCPUs = 0; + lcpu = pkg->lcpus; + while (lcpu != NULL) { + if (lcpu->package == NULL) + panic("CPU(%d)->package is NULL", + lcpu->cpu_num); + if (lcpu->package != pkg) + panic("CPU %d points to package %d, should be %d", + lcpu->cpu_num, lcpu->package->lpkg_num, pkg->lpkg_num); + DBG("CPU(%d)->package %d\n", + lcpu->cpu_num, pkg->lpkg_num); + nCPUs += 1; + lcpu = lcpu->next_in_pkg; + } + + if (nCPUs != topoParms.nLThreadsPerPackage) + panic("Should have %d Threads, but only found %d for package %d", + topoParms.nLThreadsPerPackage, nCPUs, pkg->lpkg_num); + + pkg = pkg->next; + } +} + #if TOPO_DEBUG /* * Prints out the topology diff --git a/osfmk/i386/cpu_topology.c b/osfmk/i386/cpu_topology.c index 6e823c980..58b15e913 100644 --- a/osfmk/i386/cpu_topology.c +++ b/osfmk/i386/cpu_topology.c @@ -45,6 +45,7 @@ #define DBG(x...) 
#endif void debug_topology_print(void); +void validate_topology(void); __private_extern__ void qsort( void * array, @@ -144,6 +145,7 @@ cpu_topology_start(void) #if TOPO_DEBUG debug_topology_print(); #endif /* TOPO_DEBUG */ + validate_topology(); ml_set_interrupts_enabled(istate); DBG("cpu_topology_start() LLC is L%d\n", topoParms.LLCDepth + 1); diff --git a/osfmk/i386/cpu_topology.h b/osfmk/i386/cpu_topology.h index f5cbbefb4..d4351e6b8 100644 --- a/osfmk/i386/cpu_topology.h +++ b/osfmk/i386/cpu_topology.h @@ -133,6 +133,7 @@ typedef struct x86_lcpu struct x86_die *die; /* die containing the logical cpu */ struct x86_pkg *package; /* package containing the logical cpu */ struct cpu_data *cpu; /* cpu_data structure */ + uint32_t flags; uint32_t cpu_num; /* cpu number */ uint32_t lnum; /* logical cpu number (within core) */ uint32_t pnum; /* physical cpu number */ @@ -150,8 +151,10 @@ typedef struct x86_lcpu #define X86CORE_FL_PRESENT 0x80000000 /* core is present */ #define X86CORE_FL_READY 0x40000000 /* core struct is init'd */ +#define X86CORE_FL_HAS_HPET 0x10000000 /* core has HPET assigned */ #define X86CORE_FL_HALTED 0x00008000 /* core is halted */ #define X86CORE_FL_IDLE 0x00004000 /* core is idle */ +#define X86CORE_FL_WAKEUP 0x00002000 /* wakeup is pending */ typedef struct x86_core { diff --git a/osfmk/i386/cpuid.c b/osfmk/i386/cpuid.c index f9c58c5bb..23a27ef29 100644 --- a/osfmk/i386/cpuid.c +++ b/osfmk/i386/cpuid.c @@ -376,6 +376,27 @@ cpuid_set_generic_info(i386_cpu_info_t *info_p) quad(cpuid_reg[ecx], cpuid_reg[edx]); } + /* Fold in the Invariant TSC feature bit, if present */ + if (max_extid >= 0x80000007) { + do_cpuid(0x80000007, cpuid_reg); + info_p->cpuid_extfeatures |= + cpuid_reg[edx] & CPUID_EXTFEATURE_TSCI; + } + + /* Find the microcode version number a.k.a. signature a.k.a. BIOS ID */ + info_p->cpuid_microcode_version = + (uint32_t) (rdmsr64(MSR_IA32_BIOS_SIGN_ID) >> 32); + + if (info_p->cpuid_model == CPUID_MODEL_NEHALEM) { + /* + * For Nehalem, find the number of enabled cores and threads + * (which determines whether SMT/Hyperthreading is active). 
+ */ + uint64_t msr_core_thread_count = rdmsr64(MSR_CORE_THREAD_COUNT); + info_p->core_count = bitfield(msr_core_thread_count, 31, 16); + info_p->thread_count = bitfield(msr_core_thread_count, 15, 0); + } + if (info_p->cpuid_features & CPUID_FEATURE_MONITOR) { /* * Extract the Monitor/Mwait Leaf info: @@ -508,6 +529,8 @@ extfeature_map[] = { {CPUID_EXTFEATURE_XD, "XD"}, {CPUID_EXTFEATURE_EM64T, "EM64T"}, {CPUID_EXTFEATURE_LAHF, "LAHF"}, + {CPUID_EXTFEATURE_RDTSCP, "RDTSCP"}, + {CPUID_EXTFEATURE_TSCI, "TSCI"}, {0, 0} }; diff --git a/osfmk/i386/cpuid.h b/osfmk/i386/cpuid.h index 34eed7b4d..8e690a71f 100644 --- a/osfmk/i386/cpuid.h +++ b/osfmk/i386/cpuid.h @@ -107,10 +107,18 @@ */ #define CPUID_EXTFEATURE_SYSCALL _Bit(11) /* SYSCALL/sysret */ #define CPUID_EXTFEATURE_XD _Bit(20) /* eXecute Disable */ +#define CPUID_EXTFEATURE_RDTSCP _Bit(27) /* RDTSCP */ #define CPUID_EXTFEATURE_EM64T _Bit(29) /* Extended Mem 64 Technology */ #define CPUID_EXTFEATURE_LAHF _HBit(20) /* LAFH/SAHF instructions */ +/* + * The CPUID_EXTFEATURE_XXX values define 64-bit values + * returned in %ecx:%edx to a CPUID request with %eax of 0x80000007: + */ +#define CPUID_EXTFEATURE_TSCI _Bit(8) /* TSC Invariant */ + + #define CPUID_CACHE_SIZE 16 /* Number of descriptor vales */ #define CPUID_CACHE_NULL 0x00 /* NULL */ diff --git a/osfmk/i386/etimer.c b/osfmk/i386/etimer.c index ddbd77ffc..aacc02ebc 100644 --- a/osfmk/i386/etimer.c +++ b/osfmk/i386/etimer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -39,6 +39,7 @@ #include +#include #include #include #include @@ -55,9 +56,6 @@ #include #include -/* XXX from /rtclock.c */ -clock_timer_func_t rtclock_timer_expire; - /* * Event timer interrupt. * @@ -94,8 +92,7 @@ __unused uint64_t iaddr) /* has a pending clock timer expired? */ if (mytimer->deadline <= abstime) { /* Have we expired the deadline? */ mytimer->has_expired = TRUE; /* Remember that we popped */ - mytimer->deadline = EndOfAllTime; /* Set timer request to the end of all time in case we have no more events */ - (*rtclock_timer_expire)(abstime); /* Process pop */ + mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); mytimer->has_expired = FALSE; } @@ -105,7 +102,7 @@ __unused uint64_t iaddr) } /* - * Set the clock deadline; called by the thread scheduler. + * Set the clock deadline. 
*/ void etimer_set_deadline(uint64_t deadline) { @@ -178,3 +175,59 @@ etimer_resync_deadlines(void) } splx(s); } + +void etimer_timer_expire(void *arg); + +void +etimer_timer_expire( +__unused void *arg) +{ + rtclock_timer_t *mytimer; + uint64_t abstime; + cpu_data_t *pp; + x86_lcpu_t *lcpu; + + pp = current_cpu_datap(); + lcpu = x86_lcpu(); + + mytimer = &pp->rtclock_timer; + abstime = mach_absolute_time(); + + mytimer->has_expired = TRUE; + mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); + mytimer->has_expired = FALSE; + + lcpu->rtcPop = EndOfAllTime; + etimer_resync_deadlines(); +} + +queue_t +timer_queue_assign( + uint64_t deadline) +{ + cpu_data_t *cdp = current_cpu_datap(); + rtclock_timer_t *timer; + + if (cdp->cpu_running) { + timer = &cdp->rtclock_timer; + + if (deadline < timer->deadline) + etimer_set_deadline(deadline); + } + else + timer = &cpu_datap(master_cpu)->rtclock_timer; + + return (&timer->queue); +} + +void +timer_queue_cancel( + queue_t queue, + uint64_t deadline, + uint64_t new_deadline) +{ + if (queue == &current_cpu_datap()->rtclock_timer.queue) { + if (deadline < new_deadline) + etimer_set_deadline(new_deadline); + } +} diff --git a/osfmk/i386/i386_lock.s b/osfmk/i386/i386_lock.s index b5153c73d..e4d7f7c1e 100644 --- a/osfmk/i386/i386_lock.s +++ b/osfmk/i386/i386_lock.s @@ -420,6 +420,7 @@ LEAF_ENTRY(hw_lock_to) push %ebx mov %edx,%edi + lfence rdtsc /* read cyclecount into %edx:%eax */ lfence addl %ecx,%eax /* fetch and timeout */ @@ -442,6 +443,7 @@ LEAF_ENTRY(hw_lock_to) /* * Here after spinning INNER_LOOP_COUNT times, check for timeout */ + lfence rdtsc /* cyclecount into %edx:%eax */ lfence cmpl %ecx,%edx /* compare high-order 32-bits */ diff --git a/osfmk/i386/i386_vm_init.c b/osfmk/i386/i386_vm_init.c index 797022979..40086ffd1 100644 --- a/osfmk/i386/i386_vm_init.c +++ b/osfmk/i386/i386_vm_init.c @@ -382,7 +382,8 @@ i386_vm_init(uint64_t maxmem, avail_start = first_avail; mem_actual = sane_size; -#define MEG (1024*1024) +#define MEG (1024*1024ULL) +#define GIG (1024*MEG) /* * For user visible memory size, round up to 128 Mb - accounting for the various stolen memory @@ -391,6 +392,19 @@ i386_vm_init(uint64_t maxmem, sane_size = (sane_size + 128 * MEG - 1) & ~((uint64_t)(128 * MEG - 1)); +#if defined(__i386__) +#define K32_MAXMEM (32*GIG) + /* + * For K32 we cap at K32_MAXMEM GB (currently 32GB). + * Unless overridden by the maxmem= boot-arg + * -- which is a non-zero maxmem argument to this function.
+ */ + if (maxmem == 0 && sane_size > K32_MAXMEM) { + maxmem = K32_MAXMEM; + printf("Physical memory %lld bytes capped at %dGB for 32-bit kernel\n", + sane_size, (uint32_t) (K32_MAXMEM/GIG)); + } +#endif /* * if user set maxmem, reduce memory sizes */ diff --git a/osfmk/i386/lapic.c b/osfmk/i386/lapic.c index 1dd1212db..ef37b72fa 100644 --- a/osfmk/i386/lapic.c +++ b/osfmk/i386/lapic.c @@ -78,6 +78,15 @@ static i386_intr_func_t lapic_intr_func[LAPIC_FUNC_TABLE_SIZE]; /* TRUE if local APIC was enabled by the OS not by the BIOS */ static boolean_t lapic_os_enabled = FALSE; +static boolean_t lapic_errors_masked = FALSE; +static uint64_t lapic_last_master_error = 0; +static uint64_t lapic_error_time_threshold = 0; +static unsigned lapic_master_error_count = 0; +static unsigned lapic_error_count_threshold = 5; +static boolean_t lapic_dont_panic = FALSE; + +extern int debug_boot_arg; + /* Base vector for local APIC interrupt sources */ int lapic_interrupt_base = LAPIC_DEFAULT_INTERRUPT_BASE; @@ -255,6 +264,12 @@ lapic_dump(void) BOOL(LAPIC_READ(SVR)&LAPIC_SVR_ENABLE), BOOL(!(LAPIC_READ(SVR)&LAPIC_SVR_FOCUS_OFF)), LAPIC_READ(SVR) & LAPIC_SVR_MASK); + if (mca_is_cmci_present()) + kprintf("LVT_CMCI: Vector 0x%02x [%s] %s %cmasked\n", + VEC(LVT_CMCI), + DM(LVT_CMCI), + DS(LVT_CMCI), + MASK(LVT_CMCI)); kprintf("LVT_TIMER: Vector 0x%02x %s %cmasked %s\n", VEC(LVT_TIMER), DS(LVT_TIMER), @@ -386,15 +401,15 @@ lapic_shutdown(void) LAPIC_WRITE(LVT_LINT0, value); } + /* Error: masked */ + LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); + /* Timer: masked */ LAPIC_WRITE(LVT_TIMER, LAPIC_READ(LVT_TIMER) | LAPIC_LVT_MASKED); /* Perfmon: masked */ LAPIC_WRITE(LVT_PERFCNT, LAPIC_READ(LVT_PERFCNT) | LAPIC_LVT_MASKED); - /* Error: masked */ - LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); - /* APIC software disabled */ LAPIC_WRITE(SVR, LAPIC_READ(SVR) & ~LAPIC_SVR_ENABLE); @@ -412,6 +427,13 @@ lapic_configure(void) { int value; + if (lapic_error_time_threshold == 0 && cpu_number() == 0) { + nanoseconds_to_absolutetime(NSEC_PER_SEC >> 2, &lapic_error_time_threshold); + if (!PE_parse_boot_argn("lapic_dont_panic", &lapic_dont_panic, sizeof(lapic_dont_panic))) { + lapic_dont_panic = FALSE; + } + } + /* Set flat delivery model, logical processor id */ LAPIC_WRITE(DFR, LAPIC_DFR_FLAT); LAPIC_WRITE(LDR, (get_cpu_number()) << LAPIC_LDR_SHIFT); @@ -438,9 +460,15 @@ lapic_configure(void) /* Thermal: unmasked */ LAPIC_WRITE(LVT_THERMAL, LAPIC_VECTOR(THERMAL)); - lapic_esr_clear(); + /* CMCI, if available */ + if (mca_is_cmci_present()) + LAPIC_WRITE(LVT_CMCI, LAPIC_VECTOR(CMCI)); - LAPIC_WRITE(LVT_ERROR, LAPIC_VECTOR(ERROR)); + if (((cpu_number() == master_cpu) && lapic_errors_masked == FALSE) || + (cpu_number() != master_cpu)) { + lapic_esr_clear(); + LAPIC_WRITE(LVT_ERROR, LAPIC_VECTOR(ERROR)); + } } void @@ -510,6 +538,7 @@ lapic_set_intr_func(int vector, i386_intr_func_t func) case LAPIC_TIMER_INTERRUPT: case LAPIC_THERMAL_INTERRUPT: case LAPIC_PERFCNT_INTERRUPT: + case LAPIC_CMCI_INTERRUPT: lapic_intr_func[vector] = func; break; default: @@ -522,6 +551,7 @@ int lapic_interrupt(int interrupt, x86_saved_state_t *state) { int retval = 0; + int esr = -1; interrupt -= lapic_interrupt_base; if (interrupt < 0) { @@ -538,17 +568,64 @@ lapic_interrupt(int interrupt, x86_saved_state_t *state) switch(interrupt) { case LAPIC_TIMER_INTERRUPT: case LAPIC_THERMAL_INTERRUPT: + case LAPIC_PERFCNT_INTERRUPT: case LAPIC_INTERPROCESSOR_INTERRUPT: if (lapic_intr_func[interrupt] != NULL) (void) 
(*lapic_intr_func[interrupt])(state); if (interrupt == LAPIC_PERFCNT_INTERRUPT) + /* Clear interrupt masked */ LAPIC_WRITE(LVT_PERFCNT, LAPIC_VECTOR(PERFCNT)); _lapic_end_of_interrupt(); retval = 1; break; + case LAPIC_CMCI_INTERRUPT: + if (lapic_intr_func[interrupt] != NULL) + (void) (*lapic_intr_func[interrupt])(state); + /* return 0 for platform expert to handle */ + break; case LAPIC_ERROR_INTERRUPT: + /* We treat error interrupts on APs as fatal. + * The current interrupt steering scheme directs most + * external interrupts to the BSP (HPET interrupts being + * a notable exception); hence, such an error + * on an AP may signify LVT corruption (with "may" being + * the operative word). On the BSP, we adopt a more + * lenient approach, in the interests of enhancing + * debuggability and reducing fragility. + * If "lapic_error_count_threshold" error interrupts + * occur within "lapic_error_time_threshold" absolute + * time units, we mask the error vector and log. The + * error interrupts themselves are likely + * side effects of issues which are beyond the purview of + * the local APIC interrupt handler, however. The Error + * Status Register value (the illegal destination + * vector code is one observed in practice) indicates + * the immediate cause of the error. + */ + esr = lapic_esr_read(); lapic_dump(); - panic("Local APIC error\n"); + + if ((debug_boot_arg && (lapic_dont_panic == FALSE)) || + cpu_number() != master_cpu) { + panic("Local APIC error, ESR: %d\n", esr); + } + + if (cpu_number() == master_cpu) { + uint64_t abstime = mach_absolute_time(); + if ((abstime - lapic_last_master_error) < lapic_error_time_threshold) { + if (lapic_master_error_count++ > lapic_error_count_threshold) { + lapic_errors_masked = TRUE; + LAPIC_WRITE(LVT_ERROR, LAPIC_READ(LVT_ERROR) | LAPIC_LVT_MASKED); + printf("Local APIC: errors masked\n"); + } + } + else { + lapic_last_master_error = abstime; + lapic_master_error_count = 0; + } + printf("Local APIC error on master CPU, ESR: %d, error count this run: %d\n", esr, lapic_master_error_count); + } + _lapic_end_of_interrupt(); retval = 1; break; diff --git a/osfmk/i386/lapic.h b/osfmk/i386/lapic.h index 4fa855676..b37b3a789 100644 --- a/osfmk/i386/lapic.h +++ b/osfmk/i386/lapic.h @@ -62,6 +62,7 @@ #define LAPIC_TMR_BASE 0x00000180 #define LAPIC_IRR_BASE 0x00000200 #define LAPIC_ERROR_STATUS 0x00000280 +#define LAPIC_LVT_CMCI 0x000002F0 #define LAPIC_ICR 0x00000300 #define LAPIC_ICR_VECTOR_MASK 0x000FF #define LAPIC_ICR_DM_MASK 0x00700 @@ -238,6 +239,10 @@ static inline void lapic_set_thermal_func(i386_intr_func_t func) { lapic_set_intr_func(LAPIC_VECTOR(THERMAL), func); } +static inline void lapic_set_cmci_func(i386_intr_func_t func) +{ + lapic_set_intr_func(LAPIC_VECTOR(CMCI), func); +} #ifdef MP_DEBUG #define LAPIC_CPU_MAP_DUMP() lapic_cpu_map_dump() diff --git a/osfmk/i386/machine_check.c b/osfmk/i386/machine_check.c index 23f26fc50..6eaff9d8f 100644 --- a/osfmk/i386/machine_check.c +++ b/osfmk/i386/machine_check.c @@ -47,6 +47,7 @@ static boolean_t mca_control_MSR_present = FALSE; static boolean_t mca_threshold_status_present = FALSE; static boolean_t mca_extended_MSRs_present = FALSE; static unsigned int mca_extended_MSRs_count = 0; +static boolean_t mca_cmci_present = FALSE; static ia32_mcg_cap_t ia32_mcg_cap; decl_simple_lock_data(static, mca_lock); @@ -88,6 +89,7 @@ mca_get_availability(void) mca_error_bank_count = ia32_mcg_cap.bits.count; mca_control_MSR_present = ia32_mcg_cap.bits.mcg_ctl_p; mca_threshold_status_present =
ia32_mcg_cap.bits.mcg_tes_p; + mca_cmci_present = ia32_mcg_cap.bits.mcg_ext_corr_err_p; if (family == 0x0F) { mca_extended_MSRs_present = ia32_mcg_cap.bits.mcg_ext_p; mca_extended_MSRs_count = ia32_mcg_cap.bits.mcg_ext_cnt; @@ -144,6 +146,14 @@ mca_cpu_init(void) } } +boolean_t +mca_is_cmci_present(void) +{ + if (!mca_initialized) + mca_cpu_init(); + return mca_cmci_present; +} + void mca_cpu_alloc(cpu_data_t *cdp) { @@ -195,6 +205,13 @@ mca_save_state(mca_state_t *mca_state) bank->mca_mci_addr = (bank->mca_mci_status.bits.addrv)? rdmsr64(IA32_MCi_ADDR(i)) : 0ULL; } + + /* + * If we're the first thread with MCA state, point our package to it + * and don't care about races + */ + if (x86_package()->mca_state == NULL) + x86_package()->mca_state = mca_state; } void @@ -265,6 +282,78 @@ mca_report_cpu_info(void) kdb_printf(" %s\n", infop->cpuid_brand_string); } +static const char *mc8_memory_operation[] = { + [MC8_MMM_GENERIC] "generic", + [MC8_MMM_READ] "read", + [MC8_MMM_WRITE] "write", + [MC8_MMM_ADDRESS_COMMAND] "address/command", + [MC8_MMM_RESERVED] "reserved" +}; + +static void +mca_dump_bank_mc8(mca_state_t *state, int i) +{ + mca_mci_bank_t *bank; + ia32_mci_status_t status; + struct ia32_mc8_specific mc8; + int mmm; + + bank = &state->mca_error_bank[i]; + status = bank->mca_mci_status; + mc8 = status.bits_mc8; + mmm = MIN(mc8.memory_operation, MC8_MMM_RESERVED); + + kdb_printf( + " IA32_MC%d_STATUS(0x%x): 0x%016qx %svalid\n", + i, IA32_MCi_STATUS(i), status.u64, IF(!status.bits.val, "in")); + if (!status.bits.val) + return; + + kdb_printf( + " Channel number: %d%s\n" + " Memory Operation: %s\n" + " Machine-specific error: %s%s%s%s%s%s%s%s\n" + " COR_ERR_CNT: %d\n", + mc8.channel_number, + IF(mc8.channel_number == 15, " (unknown)"), + mc8_memory_operation[mmm], + IF(mc8.read_ecc, "Read ECC"), + IF(mc8.ecc_on_a_scrub, "ECC on scrub"), + IF(mc8.write_parity, "Write parity"), + IF(mc8.redundant_memory, "Redundant memory"), + IF(mc8.sparing, "Sparing/Resilvering"), + IF(mc8.access_out_of_range, "Access out of Range"), + IF(mc8.address_parity, "Address Parity"), + IF(mc8.byte_enable_parity, "Byte Enable Parity"), + mc8.cor_err_cnt); + kdb_printf( + " Status bits:\n%s%s%s%s%s%s", + IF(status.bits.pcc, " Processor context corrupt\n"), + IF(status.bits.addrv, " ADDR register valid\n"), + IF(status.bits.miscv, " MISC register valid\n"), + IF(status.bits.en, " Error enabled\n"), + IF(status.bits.uc, " Uncorrected error\n"), + IF(status.bits.over, " Error overflow\n")); + if (status.bits.addrv) + kdb_printf( + " IA32_MC%d_ADDR(0x%x): 0x%016qx\n", + i, IA32_MCi_ADDR(i), bank->mca_mci_addr); + if (status.bits.miscv) { + ia32_mc8_misc_t mc8_misc; + + mc8_misc.u64 = bank->mca_mci_misc; + kdb_printf( + " IA32_MC%d_MISC(0x%x): 0x%016qx\n" + " DIMM: %d\n" + " Channel: %d\n" + " Syndrome: 0x%x\n", + i, IA32_MCi_MISC(i), mc8_misc.u64, + mc8_misc.bits.dimm, + mc8_misc.bits.channel, + (int) mc8_misc.bits.syndrome); + } +} + static const char *mca_threshold_status[] = { [THRESHOLD_STATUS_NO_TRACKING] "No tracking", [THRESHOLD_STATUS_GREEN] "Green", @@ -331,6 +420,37 @@ mca_dump_error_banks(mca_state_t *state) kdb_printf("MCA error-reporting registers:\n"); for (i = 0; i < mca_error_bank_count; i++ ) { + if (i == 8) { + /* + * Fatal Memory Error + */ + + /* Dump MC8 for local package */ + kdb_printf(" Package %d logged:\n", + x86_package()->ppkg_num); + mca_dump_bank_mc8(state, 8); + + /* If there's other packages, report their MC8s */ + x86_pkg_t *pkg; + uint64_t deadline; + for (pkg = x86_pkgs; pkg != 
NULL; pkg = pkg->next) { + if (pkg == x86_package()) + continue; + deadline = mach_absolute_time() + LockTimeOut; + while (pkg->mca_state == NULL && + mach_absolute_time() < deadline) + cpu_pause(); + if (pkg->mca_state) { + kdb_printf(" Package %d logged:\n", + pkg->ppkg_num); + mca_dump_bank_mc8(pkg->mca_state, 8); + } else { + kdb_printf(" Package %d timed out!\n", + pkg->ppkg_num); + } + } + continue; + } mca_dump_bank(state, i); } } @@ -376,7 +496,8 @@ mca_dump(void) " control MSR present\n"), IF(mca_threshold_status_present, " threshold-based error status present\n"), - ""); + IF(mca_cmci_present, + " extended corrected memory error handling present\n")); if (mca_extended_MSRs_present) kdb_printf( " %d extended MSRs present\n", mca_extended_MSRs_count); diff --git a/osfmk/i386/machine_check.h b/osfmk/i386/machine_check.h index 233e78e2c..7ecf69403 100644 --- a/osfmk/i386/machine_check.h +++ b/osfmk/i386/machine_check.h @@ -49,9 +49,10 @@ typedef union { uint64_t count :BITS(7,0); uint64_t mcg_ctl_p :BIT1(8); uint64_t mcg_ext_p :BIT1(9); - uint64_t mcg_reserved1 :BIT1(10); + uint64_t mcg_ext_corr_err_p :BIT1(10); uint64_t mcg_tes_p :BIT1(11); - uint64_t mcg_reserved2 :BITS(15,12); + uint64_t mcg_ecms :BIT1(12); + uint64_t mcg_reserved2 :BITS(15,13); uint64_t mcg_ext_cnt :BITS(23,16); } bits; uint64_t u64; @@ -123,7 +124,7 @@ typedef union { uint64_t over :BIT1(62); uint64_t val :BIT1(63); } bits; - struct { /* Variant if threshold-based error status present: */ + struct { /* Variant if threshold-based error status present: */ uint64_t mca_error :BITS(15,0); uint64_t model_specific_error :BITS(31,16); uint64_t other_information :BITS(52,32); @@ -136,6 +137,21 @@ typedef union { uint64_t over :BIT1(62); uint64_t val :BIT1(63); } bits_tes_p; + struct ia32_mc8_specific { + uint64_t channel_number :BITS(3,0); + uint64_t memory_operation :BITS(6,4); + uint64_t unused :BITS(15,7); + uint64_t read_ecc :BIT1(16); + uint64_t ecc_on_a_scrub :BIT1(17); + uint64_t write_parity :BIT1(18); + uint64_t redundant_memory :BIT1(19); + uint64_t sparing :BIT1(20); + uint64_t access_out_of_range :BIT1(21); + uint64_t address_parity :BIT1(23); + uint64_t byte_enable_parity :BIT1(24); + uint64_t reserved :BITS(37,25); + uint64_t cor_err_cnt :BITS(52,38); + } bits_mc8; uint64_t u64; } ia32_mci_status_t; @@ -145,6 +161,24 @@ typedef union { #define THRESHOLD_STATUS_YELLOW 2 #define THRESHOLD_STATUS_RESERVED 3 +/* MC8 memory operations encoding: */ +#define MC8_MMM_GENERIC 0 +#define MC8_MMM_READ 1 +#define MC8_MMM_WRITE 2 +#define MC8_MMM_ADDRESS_COMMAND 3 +#define MC8_MMM_RESERVED 4 + +typedef union { + struct { + uint64_t reserved1 :BITS(15,0); + uint64_t dimm :BITS(17,16); + uint64_t channel :BITS(19,18); + uint64_t reserved2 :BITS(31,20); + uint64_t syndrome :BITS(63,32); + } bits; + uint64_t u64; +} ia32_mc8_misc_t; + typedef uint64_t ia32_mci_addr_t; typedef uint64_t ia32_mci_misc_t; @@ -189,6 +223,7 @@ extern void mca_cpu_alloc(cpu_data_t *cdp); extern void mca_cpu_init(void); extern void mca_dump(void); extern void mca_check_save(void); +extern boolean_t mca_is_cmci_present(void); #endif /* _I386_MACHINE_CHECK_H_ */ #endif /* KERNEL_PRIVATE */ diff --git a/osfmk/i386/machine_routines.c b/osfmk/i386/machine_routines.c index d42f6d2f1..019d7f82f 100644 --- a/osfmk/i386/machine_routines.c +++ b/osfmk/i386/machine_routines.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
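mca_is_cmci_present() reports the new mcg_ext_corr_err_p capability carved out of the previously reserved bit 10 of IA32_MCG_CAP. Here is a small sketch of decoding that capability word from a raw MSR value; the privileged MSR read itself is left out, and the field layout mirrors the ia32_mcg_cap_t change above.

#include <stdbool.h>
#include <stdint.h>

union mcg_cap {
	struct {
		uint64_t count              : 8;   /* number of error-reporting banks */
		uint64_t mcg_ctl_p          : 1;
		uint64_t mcg_ext_p          : 1;
		uint64_t mcg_ext_corr_err_p : 1;   /* bit 10: corrected-error (CMCI) signalling */
		uint64_t mcg_tes_p          : 1;
		uint64_t mcg_ecms           : 1;
		uint64_t reserved1          : 3;
		uint64_t mcg_ext_cnt        : 8;
		uint64_t reserved2          : 40;
	} bits;
	uint64_t u64;
};

static bool
cmci_present(uint64_t mcg_cap_msr)
{
	union mcg_cap cap = { .u64 = mcg_cap_msr };
	return cap.bits.mcg_ext_corr_err_p != 0;
}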
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -296,7 +296,7 @@ void machine_signal_idle( processor_t processor) { - cpu_interrupt(PROCESSOR_DATA(processor, slot_num)); + cpu_interrupt(processor->cpu_num); } thread_t diff --git a/osfmk/i386/machine_routines_asm.s b/osfmk/i386/machine_routines_asm.s index f68b81376..bb4095af3 100644 --- a/osfmk/i386/machine_routines_asm.s +++ b/osfmk/i386/machine_routines_asm.s @@ -47,6 +47,7 @@ ENTRY(ml_get_timebase) movl S_ARG0, %ecx + lfence rdtsc lfence @@ -235,7 +236,9 @@ Lslow: pushl %esi /* save generation */ pushl RNT_SHIFT(%edi) /* save low 32 bits of tscFreq */ - rdtsc /* get TSC in %edx:%eax */ + lfence + rdtsc /* get TSC in %edx:%eax */ + lfence subl RNT_TSC_BASE(%edi),%eax sbbl RNT_TSC_BASE+4(%edi),%edx diff --git a/osfmk/i386/mp.c b/osfmk/i386/mp.c index 507399783..00db14a66 100644 --- a/osfmk/i386/mp.c +++ b/osfmk/i386/mp.c @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -145,6 +146,7 @@ decl_mutex_data(static, mp_bc_lock); static volatile int debugger_cpu = -1; static void mp_cpus_call_action(void); +static void mp_call_PM(void); #if GPROF /* @@ -208,13 +210,51 @@ mp_wait_for_cpu_up(int slot_num, unsigned int iters, unsigned int usecdelay) } } +typedef struct { + int target_cpu; + int target_lapic; + int starter_cpu; + boolean_t is_nehalem; +} processor_start_info_t; + +static processor_start_info_t start_info; + +static void +start_cpu(void *arg) +{ + int i = 1000; + processor_start_info_t *psip = (processor_start_info_t *) arg; + + /* Ignore this if the current processor is not the starter */ + if (cpu_number() != psip->starter_cpu) + return; + + LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT); + delay(psip->is_nehalem ? 100 : 10000); + + LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12)); + + if (!psip->is_nehalem) { + delay(200); + LAPIC_WRITE(ICRD, psip->target_lapic << LAPIC_ICRD_DEST_SHIFT); + LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12)); + } + +#ifdef POSTCODE_DELAY + /* Wait much longer if postcodes are displayed for a delay period. */ + i *= 10000; +#endif + mp_wait_for_cpu_up(psip->target_cpu, i*100, 100); +} + kern_return_t intel_startCPU( int slot_num) { - - int i = 1000; - int lapic = cpu_to_lapic[slot_num]; + int lapic = cpu_to_lapic[slot_num]; + boolean_t istate; assert(lapic != -1); @@ -232,35 +272,33 @@ intel_startCPU( else cpu_desc_init(cpu_datap(slot_num), FALSE); - /* Serialize use of the slave boot stack. */ + /* Serialize use of the slave boot stack, etc. */ mutex_lock(&mp_cpu_boot_lock); - mp_disable_preemption(); + istate = ml_set_interrupts_enabled(FALSE); if (slot_num == get_cpu_number()) { - mp_enable_preemption(); + ml_set_interrupts_enabled(istate); mutex_unlock(&mp_cpu_boot_lock); return KERN_SUCCESS; } - LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_INIT); - delay(10000); - - LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12)); - delay(200); + start_info.starter_cpu = cpu_number(); + start_info.is_nehalem = (cpuid_info()->cpuid_model + == CPUID_MODEL_NEHALEM); + start_info.target_cpu = slot_num; + start_info.target_lapic = lapic; - LAPIC_WRITE(ICRD, lapic << LAPIC_ICRD_DEST_SHIFT); - LAPIC_WRITE(ICR, LAPIC_ICR_DM_STARTUP|(MP_BOOT>>12)); - delay(200); - -#ifdef POSTCODE_DELAY - /* Wait much longer if postcodes are displayed for a delay period. 
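start_cpu() above issues the classic INIT / STARTUP IPI sequence to bring an AP out of reset, with a shorter settle delay and a single STARTUP on Nehalem. A compressed sketch of that sequence follows; lapic_write_icr() and delay_us() are hypothetical stand-ins for the LAPIC_WRITE(ICRD/ICR, ...) and delay() primitives used in the patch, and the delivery-mode constants follow the local APIC ICR format.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical primitives, assumed rather than defined by this patch. */
void lapic_write_icr(uint32_t dest_lapic_id, uint32_t command);
void delay_us(unsigned microseconds);

#define ICR_DM_INIT     0x00000500u   /* INIT delivery mode */
#define ICR_DM_STARTUP  0x00000600u   /* STARTUP (SIPI) delivery mode */

/* Bring up an AP whose real-mode trampoline sits at boot_paddr (page-aligned,
 * below 1MB); the STARTUP vector is the trampoline's page number, which is
 * what MP_BOOT>>12 computes in the patch. */
static void
start_ap(uint32_t lapic_id, uint32_t boot_paddr, bool single_sipi)
{
	uint32_t vector = boot_paddr >> 12;

	lapic_write_icr(lapic_id, ICR_DM_INIT);       /* put the AP in wait-for-SIPI */
	delay_us(single_sipi ? 100 : 10000);

	lapic_write_icr(lapic_id, ICR_DM_STARTUP | vector);
	if (!single_sipi) {                           /* older parts get a second SIPI */
		delay_us(200);
		lapic_write_icr(lapic_id, ICR_DM_STARTUP | vector);
	}
}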
*/ - i *= 10000; -#endif - mp_wait_for_cpu_up(slot_num, i, 10000); + /* + * For Nehalem, perform the processor startup with all running + * processors rendezvous'ed. This is required during periods when + * the cache-disable bit is set for MTRR/PAT initialization. + */ + if (start_info.is_nehalem) + mp_rendezvous_no_intrs(start_cpu, (void *) &start_info); + else + start_cpu((void *) &start_info); - mp_enable_preemption(); + ml_set_interrupts_enabled(istate); mutex_unlock(&mp_cpu_boot_lock); if (!cpu_datap(slot_num)->cpu_running) { @@ -432,6 +470,10 @@ cpu_signal_handler(x86_saved_state_t *regs) DBGLOG(cpu_handle,my_cpu,MP_CALL); i_bit_clear(MP_CALL, my_word); mp_cpus_call_action(); + } else if (i_bit(MP_CALL_PM, my_word)) { + DBGLOG(cpu_handle,my_cpu,MP_CALL_PM); + i_bit_clear(MP_CALL_PM, my_word); + mp_call_PM(); } } while (*my_word); @@ -548,6 +590,36 @@ cpu_NMI_interrupt(int cpu) } } +static volatile void (*mp_PM_func)(void) = NULL; + +static void +mp_call_PM(void) +{ + assert(!ml_get_interrupts_enabled()); + + if (mp_PM_func != NULL) + mp_PM_func(); +} + +void +cpu_PM_interrupt(int cpu) +{ + assert(!ml_get_interrupts_enabled()); + + if (mp_PM_func != NULL) { + if (cpu == cpu_number()) + mp_PM_func(); + else + i386_signal_cpu(cpu, MP_CALL_PM, ASYNC); + } +} + +void +PM_interrupt_register(void (*fn)(void)) +{ + mp_PM_func = fn; +} + void i386_signal_cpu(int cpu, mp_event_t event, mp_sync_t mode) { @@ -977,6 +1049,8 @@ i386_activate_cpu(void) simple_unlock(&x86_topo_lock); } +extern void etimer_timer_expire(void *arg); + void i386_deactivate_cpu(void) { @@ -988,6 +1062,10 @@ i386_deactivate_cpu(void) cdp->cpu_running = FALSE; simple_unlock(&x86_topo_lock); + timer_queue_shutdown(&cdp->rtclock_timer.queue); + cdp->rtclock_timer.deadline = EndOfAllTime; + mp_cpus_call(cpu_to_cpumask(master_cpu), ASYNC, etimer_timer_expire, NULL); + /* * In case a rendezvous/braodcast/call was initiated to this cpu * before we cleared cpu_running, we must perform any actions due. @@ -1188,7 +1266,7 @@ void cause_ast_check( processor_t processor) { - int cpu = PROCESSOR_DATA(processor, slot_num); + int cpu = processor->cpu_num; if (cpu != cpu_number()) { i386_signal_cpu(cpu, MP_AST, ASYNC); diff --git a/osfmk/i386/mp.h b/osfmk/i386/mp.h index 0fac0fbd5..d4b3551e7 100644 --- a/osfmk/i386/mp.h +++ b/osfmk/i386/mp.h @@ -164,6 +164,15 @@ extern cpu_t mp_cpus_call( void (*action_func)(void *), void *arg); +/* + * Power-management-specific SPI to: + * - register a callout function, and + * - request the callout (if registered) on a given cpu. + */ +extern void PM_interrupt_register(void (*fn)(void)); +extern void cpu_PM_interrupt(int cpu); + + __END_DECLS #if MP_DEBUG diff --git a/osfmk/i386/mp_desc.c b/osfmk/i386/mp_desc.c index 9e2df152e..798a191df 100644 --- a/osfmk/i386/mp_desc.c +++ b/osfmk/i386/mp_desc.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
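PM_interrupt_register() and cpu_PM_interrupt() give the power-management kext one callout that can be fired either locally or by sending the new MP_CALL_PM IPI to another CPU. A self-contained sketch of that registration-and-dispatch pattern; send_ipi_to() and this_cpu() are hypothetical stand-ins for i386_signal_cpu() and cpu_number().

#include <stddef.h>

typedef void (*pm_callout_t)(void);

static pm_callout_t pm_func;   /* one registered callout, or NULL */

void send_ipi_to(int cpu);     /* hypothetical IPI transport */
int  this_cpu(void);           /* hypothetical current-CPU query */

void
pm_interrupt_register(pm_callout_t fn)
{
	pm_func = fn;
}

/* Run the callout on the given CPU: directly if it is the caller's CPU,
 * otherwise via an IPI whose handler invokes pm_func. */
void
pm_interrupt(int cpu)
{
	if (pm_func == NULL)
		return;
	if (cpu == this_cpu())
		pm_func();
	else
		send_ipi_to(cpu);
}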
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -488,13 +488,15 @@ cpu_data_alloc(boolean_t is_boot_cpu) if (is_boot_cpu) { assert(real_ncpus == 1); - simple_lock_init(&cpu_lock, 0); cdp = &cpu_data_master; if (cdp->cpu_processor == NULL) { + simple_lock_init(&cpu_lock, 0); cdp->cpu_processor = cpu_processor_alloc(TRUE); cdp->cpu_pmap = pmap_cpu_alloc(TRUE); cpu_desc_init(cdp, TRUE); fast_syscall_init(); + queue_init(&cdp->rtclock_timer.queue); + cdp->rtclock_timer.deadline = EndOfAllTime; } return cdp; } @@ -569,6 +571,8 @@ cpu_data_alloc(boolean_t is_boot_cpu) simple_unlock(&cpu_lock); cdp->cpu_nanotime = &rtc_nanotime_info; + queue_init(&cdp->rtclock_timer.queue); + cdp->rtclock_timer.deadline = EndOfAllTime; kprintf("cpu_data_alloc(%d) %p desc_table: %p " "ldt: %p " @@ -673,22 +677,25 @@ cpu_physwindow_init(int cpu) { cpu_data_t *cdp = cpu_data_ptr[cpu]; cpu_desc_index_t *cdi = &cdp->cpu_desc_index; - vm_offset_t phys_window; + vm_offset_t phys_window = cdp->cpu_physwindow_base; - if (vm_allocate(kernel_map, &phys_window, - PAGE_SIZE, VM_FLAGS_ANYWHERE) + if (phys_window == 0) { + if (vm_allocate(kernel_map, &phys_window, + PAGE_SIZE, VM_FLAGS_ANYWHERE) != KERN_SUCCESS) - panic("cpu_physwindow_init: couldn't allocate phys map window"); + panic("cpu_physwindow_init: " + "couldn't allocate phys map window"); - /* - * make sure the page that encompasses the - * pte pointer we're interested in actually - * exists in the page table - */ - pmap_expand(kernel_pmap, phys_window); + /* + * make sure the page that encompasses the + * pte pointer we're interested in actually + * exists in the page table + */ + pmap_expand(kernel_pmap, phys_window); - cdp->cpu_physwindow_base = phys_window; - cdp->cpu_physwindow_ptep = vtopte(phys_window); + cdp->cpu_physwindow_base = phys_window; + cdp->cpu_physwindow_ptep = vtopte(phys_window); + } cdi->cdi_gdt[sel_idx(PHYS_WINDOW_SEL)] = physwindow_desc_pattern; cdi->cdi_gdt[sel_idx(PHYS_WINDOW_SEL)].offset = phys_window; diff --git a/osfmk/i386/mp_events.h b/osfmk/i386/mp_events.h index 0da1d98c0..e870b0d03 100644 --- a/osfmk/i386/mp_events.h +++ b/osfmk/i386/mp_events.h @@ -45,6 +45,7 @@ typedef enum { MP_CHUD, MP_BROADCAST, MP_CALL, + MP_CALL_PM, MP_LAST } mp_event_t; @@ -60,6 +61,7 @@ const char *mp_event_name[] = { \ "MP_CHUD", \ "MP_BROADCAST", \ "MP_CALL", \ + "MP_CALL_PM", \ "MP_LAST" \ } diff --git a/osfmk/i386/pcb.c b/osfmk/i386/pcb.c index 04f150453..70d007f52 100644 --- a/osfmk/i386/pcb.c +++ b/osfmk/i386/pcb.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -1789,9 +1789,11 @@ machine_thread_switch_addrmode(thread_t thread) machine_thread_create(thread, thread->task); /* If we're switching ourselves, reset the pcb addresses etc. 
*/ - if (thread == current_thread()) - act_machine_switch_pcb(thread); - + if (thread == current_thread()) { + if (current_cpu_datap()->cpu_active_cr3 != kernel_pmap->pm_cr3) + pmap_load_kernel_cr3(); + act_machine_switch_pcb(thread); + } enable_preemption(); } diff --git a/osfmk/i386/pmCPU.c b/osfmk/i386/pmCPU.c index d2efc8bc9..88aa0f87b 100644 --- a/osfmk/i386/pmCPU.c +++ b/osfmk/i386/pmCPU.c @@ -106,6 +106,7 @@ machine_idle(void) goto out; my_cpu->lcpu.state = LCPU_IDLE; + my_cpu->lcpu.flags |= X86CORE_FL_IDLE; DBGLOG(cpu_handle, cpu_number(), MP_IDLE); MARK_CPU_IDLE(cpu_number()); @@ -129,6 +130,7 @@ machine_idle(void) */ MARK_CPU_ACTIVE(cpu_number()); DBGLOG(cpu_handle, cpu_number(), MP_UNIDLE); + my_cpu->lcpu.flags &= ~(X86CORE_FL_IDLE | X86CORE_FL_WAKEUP); my_cpu->lcpu.state = LCPU_RUN; /* @@ -325,6 +327,7 @@ pmCPUExitIdle(cpu_data_t *cpu) { boolean_t do_ipi; + cpu->lcpu.flags |= X86CORE_FL_WAKEUP; if (pmInitDone && pmDispatch != NULL && pmDispatch->exitIdle != NULL) @@ -332,6 +335,9 @@ pmCPUExitIdle(cpu_data_t *cpu) else do_ipi = TRUE; + if (do_ipi) + cpu->lcpu.flags &= ~X86CORE_FL_WAKEUP; + return(do_ipi); } @@ -534,6 +540,34 @@ pmSafeMode(x86_lcpu_t *lcpu, uint32_t flags) } } +static uint32_t saved_run_count = 0; + +void +machine_run_count(uint32_t count) +{ + if (pmDispatch != NULL + && pmDispatch->pmSetRunCount != NULL) + pmDispatch->pmSetRunCount(count); + else + saved_run_count = count; +} + +boolean_t +machine_cpu_is_inactive(int cpu) +{ + if (pmDispatch != NULL + && pmDispatch->pmIsCPUUnAvailable != NULL) + return(pmDispatch->pmIsCPUUnAvailable(cpu_to_lcpu(cpu))); + else + return(FALSE); +} + +static uint32_t +pmGetSavedRunCount(void) +{ + return(saved_run_count); +} + /* * Returns the root of the package tree. */ @@ -555,6 +589,22 @@ pmLCPUtoProcessor(int lcpu) return(cpu_datap(lcpu)->cpu_processor); } +static void +pmReSyncDeadlines(int cpu) +{ + static boolean_t registered = FALSE; + + if (!registered) { + PM_interrupt_register(&etimer_resync_deadlines); + registered = TRUE; + } + + if ((uint32_t)cpu == current_cpu_datap()->lcpu.cpu_num) + etimer_resync_deadlines(); + else + cpu_PM_interrupt(cpu); +} + /* * Called by the power management kext to register itself and to get the * callbacks it might need into other kernel functions. 
This interface @@ -566,23 +616,26 @@ pmKextRegister(uint32_t version, pmDispatch_t *cpuFuncs, pmCallBacks_t *callbacks) { if (callbacks != NULL && version == PM_DISPATCH_VERSION) { - callbacks->setRTCPop = setPop; - callbacks->resyncDeadlines = etimer_resync_deadlines; - callbacks->initComplete= pmInitComplete; - callbacks->GetLCPU = pmGetLogicalCPU; - callbacks->GetCore = pmGetCore; - callbacks->GetDie = pmGetDie; - callbacks->GetPackage = pmGetPackage; - callbacks->GetMyLCPU = pmGetMyLogicalCPU; - callbacks->GetMyCore = pmGetMyCore; - callbacks->GetMyDie = pmGetMyDie; - callbacks->GetMyPackage= pmGetMyPackage; - callbacks->GetPkgRoot = pmGetPkgRoot; - callbacks->LockCPUTopology = pmLockCPUTopology; - callbacks->GetHibernate = pmCPUGetHibernate; - callbacks->LCPUtoProcessor = pmLCPUtoProcessor; - callbacks->ThreadBind = thread_bind; - callbacks->topoParms = &topoParms; + callbacks->setRTCPop = setPop; + callbacks->resyncDeadlines = pmReSyncDeadlines; + callbacks->initComplete = pmInitComplete; + callbacks->GetLCPU = pmGetLogicalCPU; + callbacks->GetCore = pmGetCore; + callbacks->GetDie = pmGetDie; + callbacks->GetPackage = pmGetPackage; + callbacks->GetMyLCPU = pmGetMyLogicalCPU; + callbacks->GetMyCore = pmGetMyCore; + callbacks->GetMyDie = pmGetMyDie; + callbacks->GetMyPackage = pmGetMyPackage; + callbacks->GetPkgRoot = pmGetPkgRoot; + callbacks->LockCPUTopology = pmLockCPUTopology; + callbacks->GetHibernate = pmCPUGetHibernate; + callbacks->LCPUtoProcessor = pmLCPUtoProcessor; + callbacks->ThreadBind = thread_bind; + callbacks->GetSavedRunCount = pmGetSavedRunCount; + callbacks->topoParms = &topoParms; + } else { + panic("Version mis-match between Kernel and CPU PM"); } if (cpuFuncs != NULL) { diff --git a/osfmk/i386/pmCPU.h b/osfmk/i386/pmCPU.h index ca3072b2a..6026f5ed6 100644 --- a/osfmk/i386/pmCPU.h +++ b/osfmk/i386/pmCPU.h @@ -37,7 +37,7 @@ * This value should be changed each time that pmDsipatch_t or pmCallBacks_t * changes. */ -#define PM_DISPATCH_VERSION 12 +#define PM_DISPATCH_VERSION 15 /* * Dispatch table for functions that get installed when the power @@ -69,11 +69,13 @@ typedef struct void (*pmTimerStateRestore)(void); kern_return_t (*exitHalt)(x86_lcpu_t *lcpu); void (*markAllCPUsOff)(void); + void (*pmSetRunCount)(uint32_t count); + boolean_t (*pmIsCPUUnAvailable)(x86_lcpu_t *lcpu); } pmDispatch_t; typedef struct { int (*setRTCPop)(uint64_t time); - void (*resyncDeadlines)(void); + void (*resyncDeadlines)(int cpu); void (*initComplete)(void); x86_lcpu_t *(*GetLCPU)(int cpu); x86_core_t *(*GetCore)(int cpu); @@ -88,6 +90,7 @@ typedef struct { boolean_t (*GetHibernate)(int cpu); processor_t (*LCPUtoProcessor)(int lcpu); processor_t (*ThreadBind)(processor_t proc); + uint32_t (*GetSavedRunCount)(void); x86_topology_parameters_t *topoParms; } pmCallBacks_t; diff --git a/osfmk/i386/pmap.c b/osfmk/i386/pmap.c index a424d7e11..75685d57d 100644 --- a/osfmk/i386/pmap.c +++ b/osfmk/i386/pmap.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -3581,12 +3581,6 @@ phys_attribute_clear( vm_map_offset_t va; va = pv_e->va; - /* - * first make sure any processor actively - * using this pmap, flushes its TLB state - */ - - PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); /* * Clear modify and/or reference bits. 
@@ -3594,7 +3588,13 @@ phys_attribute_clear( pte = pmap_pte(pmap, va); pmap_update_pte(pte, *pte, (*pte & ~bits)); - + /* Ensure all processors using this translation + * invalidate this TLB entry. The invalidation *must* follow + * the PTE update, to ensure that the TLB shadow of the + * 'D' bit (in particular) is synchronized with the + * updated PTE. + */ + PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE); } pv_e = (pv_hashed_entry_t)queue_next(&pv_e->qlink); diff --git a/osfmk/i386/proc_reg.h b/osfmk/i386/proc_reg.h index a74ca7548..a8eefb5b1 100644 --- a/osfmk/i386/proc_reg.h +++ b/osfmk/i386/proc_reg.h @@ -275,7 +275,7 @@ static inline void invlpg(unsigned long addr) __asm__ volatile("wrmsr" : : "c" (msr), "a" (lo), "d" (hi)) #define rdtsc(lo,hi) \ - __asm__ volatile("rdtsc; lfence" : "=a" (lo), "=d" (hi)) + __asm__ volatile("lfence; rdtsc; lfence" : "=a" (lo), "=d" (hi)) #define write_tsc(lo,hi) wrmsr(0x10, lo, hi) @@ -297,7 +297,17 @@ static inline void wrmsr64(uint32_t msr, uint64_t val) static inline uint64_t rdtsc64(void) { uint64_t ret; - __asm__ volatile("rdtsc; lfence" : "=A" (ret)); + __asm__ volatile("lfence; rdtsc; lfence" : "=A" (ret)); + return ret; +} + +static inline uint64_t rdtscp64(uint32_t *aux) +{ + uint64_t ret; + __asm__ volatile("rdtscp; mov %%ecx, %1" + : "=A" (ret), "=m" (*aux) + : + : "ecx"); return ret; } @@ -412,4 +422,8 @@ __END_DECLS #define MSR_IA32_BIOS_SIGN_ID 0x08B +#define MSR_FLEX_RATIO 0x194 +#define MSR_PLATFORM_INFO 0x0ce +#define MSR_CORE_THREAD_COUNT 0x035 + #endif /* _I386_PROC_REG_H_ */ diff --git a/osfmk/i386/rtclock.c b/osfmk/i386/rtclock.c index 982c160f4..4b06c8a1e 100644 --- a/osfmk/i386/rtclock.c +++ b/osfmk/i386/rtclock.c @@ -86,9 +86,6 @@ uint64_t rtc_decrementer_min; void rtclock_intr(x86_saved_state_t *regs); static uint64_t maxDec; /* longest interval our hardware timer can handle (nsec) */ -/* XXX this should really be in a header somewhere */ -extern clock_timer_func_t rtclock_timer_expire; - static void rtc_set_timescale(uint64_t cycles); static uint64_t rtc_export_speed(uint64_t cycles); @@ -461,14 +458,6 @@ clock_timebase_info( info->numer = info->denom = 1; } -void -clock_set_timer_func( - clock_timer_func_t func) -{ - if (rtclock_timer_expire == NULL) - rtclock_timer_expire = func; -} - /* * Real-time clock device interrupt. */ diff --git a/osfmk/i386/rtclock.h b/osfmk/i386/rtclock.h index e3ea716d4..6f3406a8c 100644 --- a/osfmk/i386/rtclock.h +++ b/osfmk/i386/rtclock.h @@ -83,6 +83,7 @@ extern rtc_nanotime_t rtc_nanotime_info; 0: movl RNT_GENERATION(%edi),%esi /* being updated? */ ; \ testl %esi,%esi ; \ jz 0b /* wait until done */ ; \ + lfence ; \ rdtsc ; \ lfence ; \ subl RNT_TSC_BASE(%edi),%eax ; \ @@ -111,6 +112,7 @@ extern rtc_nanotime_t rtc_nanotime_info; 0: movl RNT_GENERATION(%rdi),%esi ; \ test %esi,%esi /* info updating? 
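The proc_reg.h changes above bracket RDTSC with LFENCE on both sides so the timestamp cannot be reordered with surrounding loads. A 64-bit GCC-style sketch of the same idea (the patch's own rdtsc64() uses the "=A" constraint, which only pairs EDX:EAX correctly in 32-bit code):

#include <stdint.h>

/* Serialized TSC read for x86-64: LFENCE before and after keeps the
 * RDTSC ordered with respect to earlier and later loads. */
static inline uint64_t
rdtsc_serialized(void)
{
	uint32_t lo, hi;

	__asm__ volatile("lfence; rdtsc; lfence"
	                 : "=a" (lo), "=d" (hi)
	                 :
	                 : "memory");
	return ((uint64_t)hi << 32) | lo;
}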
*/ ; \ jz 0b /* - wait if so */ ; \ + lfence ; \ rdtsc ; \ lfence ; \ shlq $32,%rdx ; \ diff --git a/osfmk/i386/tsc.c b/osfmk/i386/tsc.c index 624e5d431..669bc401f 100644 --- a/osfmk/i386/tsc.c +++ b/osfmk/i386/tsc.c @@ -148,12 +148,47 @@ tsc_init(void) cpuid_info()->cpuid_family); } - { + switch (cpuid_info()->cpuid_model) { + case CPUID_MODEL_NEHALEM: { + uint64_t cpu_mhz; + uint64_t msr_flex_ratio; + uint64_t msr_platform_info; + + /* See if FLEX_RATIO is being used */ + msr_flex_ratio = rdmsr64(MSR_FLEX_RATIO); + msr_platform_info = rdmsr64(MSR_PLATFORM_INFO); + flex_ratio_min = (uint32_t)bitfield(msr_platform_info, 47, 40); + flex_ratio_max = (uint32_t)bitfield(msr_platform_info, 15, 8); + /* No BIOS-programed flex ratio. Use hardware max as default */ + tscGranularity = flex_ratio_max; + if (msr_flex_ratio & bit(16)) { + /* Flex Enabled: Use this MSR if less than max */ + flex_ratio = (uint32_t)bitfield(msr_flex_ratio, 15, 8); + if (flex_ratio < flex_ratio_max) + tscGranularity = flex_ratio; + } + + /* If EFI isn't configured correctly, use a constant + * value. See 6036811. + */ + if (busFreq == 0) + busFreq = BASE_NHM_CLOCK_SOURCE; + + cpu_mhz = tscGranularity * BASE_NHM_CLOCK_SOURCE; + + kprintf("[NHM] Maximum Non-Turbo Ratio = [%d]\n", + (uint32_t)tscGranularity); + kprintf("[NHM] CPU: Frequency = %6d.%04dMhz\n", + (uint32_t)(cpu_mhz / Mega), (uint32_t)(cpu_mhz % Mega)); + break; + } + default: { uint64_t prfsts; prfsts = rdmsr64(IA32_PERF_STS); tscGranularity = (uint32_t)bitfield(prfsts, 44, 40); N_by_2_bus_ratio = (prfsts & bit(46)) != 0; + } } if (busFreq != 0) { diff --git a/osfmk/i386/tsc.h b/osfmk/i386/tsc.h index 1b6589de7..e702ec234 100644 --- a/osfmk/i386/tsc.h +++ b/osfmk/i386/tsc.h @@ -40,6 +40,7 @@ #ifndef _I386_TSC_H_ #define _I386_TSC_H_ +#define BASE_NHM_CLOCK_SOURCE 139806638ULL #define IA32_PERF_STS 0x198 extern uint64_t busFCvtt2n; diff --git a/osfmk/i386/vmx/vmx_cpu.c b/osfmk/i386/vmx/vmx_cpu.c index 31c16ca9f..c86af004e 100644 --- a/osfmk/i386/vmx/vmx_cpu.c +++ b/osfmk/i386/vmx/vmx_cpu.c @@ -368,6 +368,7 @@ void vmx_resume() { VMX_KPRINTF("vmx_resume\n"); + vmx_init(); /* init VMX on CPU #0 */ if (vmx_use_count) vmx_on(); } diff --git a/osfmk/ipc/ipc_kmsg.h b/osfmk/ipc/ipc_kmsg.h index 0c07ed8ea..db7e6acf4 100644 --- a/osfmk/ipc/ipc_kmsg.h +++ b/osfmk/ipc/ipc_kmsg.h @@ -241,30 +241,9 @@ MACRO_END /* * extern void * ipc_kmsg_send_always(ipc_kmsg_t); - * - * Unfortunately, to avoid warnings/lint about unused variables - * when assertions are turned off, we need two versions of this. 
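tsc_init()'s Nehalem branch above derives the TSC granularity from MSR_PLATFORM_INFO (maximum non-turbo ratio, bits 15:8) unless MSR_FLEX_RATIO has its enable bit 16 set and programs a smaller ratio; the TSC frequency is that ratio times the base clock. A pure-function sketch of the same computation on raw MSR values; the base clock is passed in rather than hard-coded, since the patch supplies it via BASE_NHM_CLOCK_SOURCE.

#include <stdint.h>

static inline uint64_t
bits(uint64_t v, int hi, int lo)
{
	return (v >> lo) & ((1ULL << (hi - lo + 1)) - 1);
}

/* platform_info is MSR_PLATFORM_INFO (0xce), flex is MSR_FLEX_RATIO (0x194),
 * base_clock_hz is the bus/base clock in Hz. Returns the TSC frequency. */
static uint64_t
nehalem_tsc_hz(uint64_t platform_info, uint64_t flex, uint64_t base_clock_hz)
{
	uint64_t ratio = bits(platform_info, 15, 8);   /* max non-turbo ratio */

	if (flex & (1ULL << 16)) {                     /* flex ratio enabled by firmware */
		uint64_t flex_ratio = bits(flex, 15, 8);
		if (flex_ratio != 0 && flex_ratio < ratio)
			ratio = flex_ratio;
	}
	return ratio * base_clock_hz;
}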
*/ -#if MACH_ASSERT - #define ipc_kmsg_send_always(kmsg) \ -MACRO_BEGIN \ - mach_msg_return_t mr2; \ - \ - mr2 = ipc_kmsg_send((kmsg), MACH_SEND_ALWAYS, \ - MACH_MSG_TIMEOUT_NONE); \ - assert(mr == MACH_MSG_SUCCESS); \ -MACRO_END - -#else /* MACH_ASSERT */ - -#define ipc_kmsg_send_always(kmsg) \ -MACRO_BEGIN \ - (void) ipc_kmsg_send((kmsg), MACH_SEND_ALWAYS, \ - MACH_MSG_TIMEOUT_NONE); \ -MACRO_END - -#endif /* MACH_ASSERT */ + ipc_kmsg_send((kmsg), MACH_SEND_ALWAYS, MACH_MSG_TIMEOUT_NONE) /* Allocate a kernel message */ diff --git a/osfmk/ipc/ipc_mqueue.c b/osfmk/ipc/ipc_mqueue.c index 66c3db6f1..316babd8d 100644 --- a/osfmk/ipc/ipc_mqueue.c +++ b/osfmk/ipc/ipc_mqueue.c @@ -361,9 +361,10 @@ ipc_mqueue_send( imq_lock(mqueue); if (!imq_full(mqueue) || - (option & MACH_SEND_ALWAYS) || - (MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) == - MACH_MSG_TYPE_PORT_SEND_ONCE)) { + (!imq_full_kernel(mqueue) && + ((option & MACH_SEND_ALWAYS) || + (MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) == + MACH_MSG_TYPE_PORT_SEND_ONCE)))) { mqueue->imq_msgcount++; assert(mqueue->imq_msgcount > 0); imq_unlock(mqueue); @@ -380,6 +381,11 @@ ipc_mqueue_send( splx(s); return MACH_SEND_TIMED_OUT; } + if (imq_full_kernel(mqueue)) { + imq_unlock(mqueue); + splx(s); + return MACH_SEND_NO_BUFFER; + } mqueue->imq_fullwaiters = TRUE; thread_lock(cur_thread); if (option & MACH_SEND_TIMEOUT) diff --git a/osfmk/ipc/ipc_mqueue.h b/osfmk/ipc/ipc_mqueue.h index 1fa4294f8..4ef47d969 100644 --- a/osfmk/ipc/ipc_mqueue.h +++ b/osfmk/ipc/ipc_mqueue.h @@ -112,6 +112,7 @@ typedef struct ipc_mqueue { #define imq_held(mq) wait_queue_held(&(mq)->imq_wait_queue) #define imq_full(mq) ((mq)->imq_msgcount >= (mq)->imq_qlimit) +#define imq_full_kernel(mq) ((mq)->imq_msgcount >= MACH_PORT_QLIMIT_KERNEL) extern int ipc_mqueue_full; extern int ipc_mqueue_rcv; diff --git a/osfmk/ipc/ipc_notify.c b/osfmk/ipc/ipc_notify.c index bbffba8bc..25a26aa63 100644 --- a/osfmk/ipc/ipc_notify.c +++ b/osfmk/ipc/ipc_notify.c @@ -84,13 +84,8 @@ ipc_notify_port_deleted( ipc_port_t port, mach_port_name_t name) { - kern_return_t kr; - - kr = mach_notify_port_deleted(port, name); - if (kr != KERN_SUCCESS) { - printf("dropped port-deleted (%p, 0x%x)\n", port, name); - ipc_port_release_sonce(port); - } + (void)mach_notify_port_deleted(port, name); + /* send-once right consumed */ } /* @@ -110,15 +105,8 @@ ipc_notify_port_destroyed( ipc_port_t port, ipc_port_t right) { - kern_return_t kr; - - kr = mach_notify_port_destroyed(port, right); - if (kr != KERN_SUCCESS) { - printf("dropped port-destroyed (%p, %p)\n", - port, right); - ipc_port_release_sonce(port); - ipc_port_release_receive(right); - } + mach_notify_port_destroyed(port, right); + /* send-once and receive rights consumed */ } /* @@ -135,13 +123,8 @@ ipc_notify_no_senders( ipc_port_t port, mach_port_mscount_t mscount) { - kern_return_t kr; - - kr = mach_notify_no_senders(port, mscount); - if (kr != KERN_SUCCESS) { - printf("dropped no-senders (%p, %u)\n", port, mscount); - ipc_port_release_sonce(port); - } + (void)mach_notify_no_senders(port, mscount); + /* send-once right consumed */ } /* @@ -157,13 +140,8 @@ void ipc_notify_send_once( ipc_port_t port) { - kern_return_t kr; - - kr = mach_notify_send_once(port); - if (kr != KERN_SUCCESS) { - printf("dropped send-once (%p)\n", port); - ipc_port_release_sonce(port); - } + (void)mach_notify_send_once(port); + /* send-once right consumed */ } /* @@ -180,11 +158,6 @@ ipc_notify_dead_name( ipc_port_t port, mach_port_name_t name) { - kern_return_t kr; - - kr 
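The ipc_mqueue_send() change above stops even MACH_SEND_ALWAYS and send-once senders from growing a queue past a separate kernel-wide limit (imq_full_kernel), returning MACH_SEND_NO_BUFFER instead of queueing without bound. A sketch of that admission check in isolation; the hard-cap value is illustrative (the patch uses MACH_PORT_QLIMIT_KERNEL, defined elsewhere).

#include <stdbool.h>

#define QLIMIT_KERNEL 4096          /* illustrative hard cap only */

struct mqueue {
	int msgcount;                   /* messages currently enqueued */
	int qlimit;                     /* per-port soft limit */
};

enum send_result { SEND_OK, SEND_WOULD_BLOCK, SEND_NO_BUFFER };

/* force_send models MACH_SEND_ALWAYS / send-once rights: they may exceed
 * the per-port limit, but never the kernel-wide cap. */
static enum send_result
mqueue_admit(struct mqueue *mq, bool force_send)
{
	if (mq->msgcount >= QLIMIT_KERNEL)
		return SEND_NO_BUFFER;      /* hard stop: kernel resource exhaustion */
	if (mq->msgcount >= mq->qlimit && !force_send)
		return SEND_WOULD_BLOCK;    /* ordinary senders wait for space */
	mq->msgcount++;
	return SEND_OK;
}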
= mach_notify_dead_name(port, name); - if (kr != KERN_SUCCESS) { - printf("dropped dead-name (%p, 0x%x)\n", port, name); - ipc_port_release_sonce(port); - } + (void)mach_notify_dead_name(port, name); + /* send-once right consumed */ } diff --git a/osfmk/ipc/ipc_right.c b/osfmk/ipc/ipc_right.c index 1d9d16596..e7ffd94ec 100644 --- a/osfmk/ipc/ipc_right.c +++ b/osfmk/ipc/ipc_right.c @@ -625,7 +625,7 @@ ipc_right_clean( } default: - panic("ipc_right_clean: strange type"); + panic("ipc_right_clean: strange type - 0x%x", type); } } diff --git a/osfmk/ipc/mach_msg.c b/osfmk/ipc/mach_msg.c index 065f3b40f..8d3f17e4b 100644 --- a/osfmk/ipc/mach_msg.c +++ b/osfmk/ipc/mach_msg.c @@ -1652,8 +1652,12 @@ mach_msg_overwrite_trap( (reply_port->ip_receiver_name != rcv_name) || (reply_port->ip_pset_count != 0)) { + /* try to enqueue by sending with an immediate timeout */ ip_unlock(reply_port); - ipc_kmsg_send_always(kmsg); + mr = ipc_kmsg_send(kmsg, MACH_SEND_TIMEOUT, 0); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_destroy(kmsg); + } HOT(c_mmot_cold_052++); goto slow_get_rcv_port; } @@ -1668,6 +1672,8 @@ mach_msg_overwrite_trap( * If there are messages on the port * or other threads waiting for a message, * we cannot directly receive the reply. + * Try to enqueue it by sending with an + * immediate timeout. */ if (!wait_queue_empty(&rcv_mqueue->imq_wait_queue) || (ipc_kmsg_queue_first(&rcv_mqueue->imq_messages) != IKM_NULL)) @@ -1675,7 +1681,10 @@ mach_msg_overwrite_trap( imq_unlock(rcv_mqueue); splx(s); ip_unlock(reply_port); - ipc_kmsg_send_always(kmsg); + mr = ipc_kmsg_send(kmsg, MACH_SEND_TIMEOUT, 0); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_destroy(kmsg); + } HOT(c_mmot_cold_053++); goto slow_get_rcv_port; } diff --git a/osfmk/kern/ast.c b/osfmk/kern/ast.c index f0b75b59a..9a0e95ec6 100644 --- a/osfmk/kern/ast.c +++ b/osfmk/kern/ast.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -164,14 +164,9 @@ ast_taken( /* * Check for preemption. */ - if (reasons & AST_PREEMPT) { - processor_t myprocessor = current_processor(); + if (reasons & AST_PREEMPT) + reasons = csw_check(current_processor()); - if (csw_needed(thread, myprocessor)) - reasons = AST_PREEMPT; - else - reasons = AST_NONE; - } if ( (reasons & AST_PREEMPT) && wait_queue_assert_possible(thread) ) { counter(c_ast_taken_block++); @@ -205,7 +200,7 @@ ast_check( /* * Context switch check. */ - if ((preempt = csw_check(thread, processor)) != AST_NONE) + if ((preempt = csw_check(processor)) != AST_NONE) ast_on(preempt); } } diff --git a/osfmk/kern/call_entry.h b/osfmk/kern/call_entry.h index 0990ad142..57ab51d5e 100644 --- a/osfmk/kern/call_entry.h +++ b/osfmk/kern/call_entry.h @@ -1,6 +1,5 @@ /* - * Copyright (c) 1993-1995, 1999-2000 Apple Computer, Inc. - * All rights reserved. + * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -27,15 +26,7 @@ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* - * Private declarations for thread-based callouts. - * - * HISTORY - * - * 10 July 1999 (debo) - * Pulled into Mac OS X (microkernel). - * - * 3 July 1993 (debo) - * Created. + * Declarations for generic call outs. 
*/ #ifndef _KERN_CALL_ENTRY_H_ @@ -51,21 +42,32 @@ typedef void (*call_entry_func_t)( typedef struct call_entry { queue_chain_t q_link; + queue_t queue; call_entry_func_t func; call_entry_param_t param0; call_entry_param_t param1; uint64_t deadline; - enum { - IDLE, - PENDING, - DELAYED } state; } call_entry_data_t; +typedef struct call_entry *call_entry_t; + +extern queue_t call_entry_enqueue_deadline( + call_entry_t entry, + queue_t queue, + uint64_t deadline); + +extern queue_t call_entry_enqueue_tail( + call_entry_t entry, + queue_t queue); + +extern queue_t call_entry_dequeue( + call_entry_t entry); + #define call_entry_setup(entry, pfun, p0) \ MACRO_BEGIN \ (entry)->func = (call_entry_func_t)(pfun); \ (entry)->param0 = (call_entry_param_t)(p0); \ - (entry)->state = IDLE; \ + (entry)->queue = NULL; \ MACRO_END #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/clock.h b/osfmk/kern/clock.h index de8086406..5ca49ea74 100644 --- a/osfmk/kern/clock.h +++ b/osfmk/kern/clock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -46,6 +46,8 @@ #ifdef MACH_KERNEL_PRIVATE +#include + /* * Clock operations list structure. Contains vectors to machine * dependent clock routines. @@ -96,15 +98,6 @@ extern void clock_timebase_init(void); */ extern void clock_service_create(void); -typedef void (*clock_timer_func_t)( - uint64_t timestamp); - -extern void clock_set_timer_func( - clock_timer_func_t func); - -extern void clock_set_timer_deadline( - uint64_t deadline); - extern void clock_gettimeofday_set_commpage( uint64_t abstime, uint64_t epoch, diff --git a/osfmk/kern/debug.c b/osfmk/kern/debug.c index 679e1779c..599d1670e 100644 --- a/osfmk/kern/debug.c +++ b/osfmk/kern/debug.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -106,10 +107,15 @@ unsigned int panic_is_inited = 0; unsigned int return_on_panic = 0; unsigned long panic_caller; -char debug_buf[PAGE_SIZE]; -ppnum_t debug_buf_page; -char *debug_buf_ptr; -unsigned int debug_buf_size; +#if CONFIG_EMBEDDED +#define DEBUG_BUF_SIZE (PAGE_SIZE) +#else +#define DEBUG_BUF_SIZE (3 * PAGE_SIZE) +#endif + +char debug_buf[DEBUG_BUF_SIZE]; +char *debug_buf_ptr = debug_buf; +unsigned int debug_buf_size = sizeof(debug_buf); static char model_name[64]; @@ -184,9 +190,7 @@ debug_log_init(void) if (debug_buf_size != 0) return; debug_buf_ptr = debug_buf; - debug_buf_size = PAGE_SIZE; - debug_buf_page = pmap_find_phys(kernel_pmap, - (addr64_t)(uintptr_t)debug_buf_ptr); + debug_buf_size = sizeof(debug_buf); } #if __i386__ @@ -397,6 +401,13 @@ static void panic_display_model_name(void) { kdb_printf("System model name: %s\n", model_name); } +static void panic_display_uptime(void) { + uint64_t uptime; + absolutetime_to_nanoseconds(mach_absolute_time(), &uptime); + + kdb_printf("\nSystem uptime in nanoseconds: %llu\n", uptime); +} + extern const char version[]; extern char osversion[]; @@ -409,10 +420,54 @@ __private_extern__ void panic_display_system_configuration(void) { (osversion[0] != 0) ? 
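The call_entry.h rework above drops the IDLE/PENDING/DELAYED state enum and instead records which queue an entry currently sits on (NULL meaning idle), so enqueue and dequeue can both move the entry and answer "was it already pending?" in one step. A minimal sketch of that idiom with a hand-rolled singly linked queue standing in for the kernel's queue_t; locking is assumed to be the caller's job, as in the kernel.

#include <stddef.h>
#include <stdint.h>

struct centry;

struct cqueue {
	struct centry *head;
};

struct centry {
	struct centry *next;
	struct cqueue *queue;      /* non-NULL iff the entry is enqueued */
	uint64_t       deadline;
};

static void
centry_dequeue(struct centry *e)
{
	struct cqueue *q = e->queue;
	struct centry **pp;

	if (q == NULL)
		return;                /* idle: nothing to do */
	for (pp = &q->head; *pp != NULL; pp = &(*pp)->next) {
		if (*pp == e) {
			*pp = e->next;
			break;
		}
	}
	e->queue = NULL;
}

/* Enqueue and return the queue the entry was previously on (NULL if it was
 * idle), mirroring the call_entry_enqueue_* return convention above. */
static struct cqueue *
centry_enqueue(struct centry *e, struct cqueue *q)
{
	struct cqueue *old = e->queue;

	centry_dequeue(e);         /* a pending entry simply moves queues */
	e->next = q->head;         /* push at head; the real timer queue keeps
	                              entries sorted by deadline */
	q->head = e;
	e->queue = q;
	return old;
}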
osversion : "Not yet set"); kdb_printf("\nKernel version:\n%s\n",version); panic_display_model_name(); + panic_display_uptime(); config_displayed = TRUE; } } +extern zone_t first_zone; +extern unsigned int num_zones, stack_total; + +#if defined(__i386__) +extern unsigned int inuse_ptepages_count; +#endif + +extern boolean_t panic_include_zprint; +extern vm_size_t kalloc_large_total; + +__private_extern__ void panic_display_zprint() +{ + if(panic_include_zprint == TRUE) { + + unsigned int i; + struct zone zone_copy; + + if(first_zone!=NULL) { + if(ml_nofault_copy((vm_offset_t)first_zone, (vm_offset_t)&zone_copy, sizeof(struct zone)) == sizeof(struct zone)) { + for (i = 0; i < num_zones; i++) { + if(zone_copy.cur_size > (1024*1024)) { + kdb_printf("%.20s:%lu\n",zone_copy.zone_name,(uintptr_t)zone_copy.cur_size); + } + + if(zone_copy.next_zone == NULL) { + break; + } + + if(ml_nofault_copy((vm_offset_t)zone_copy.next_zone, (vm_offset_t)&zone_copy, sizeof(struct zone)) != sizeof(struct zone)) { + break; + } + } + } + } + + kdb_printf("Kernel Stacks:%lu\n",(uintptr_t)(KERNEL_STACK_SIZE * stack_total)); +#if defined(__i386__) + kdb_printf("PageTables:%lu\n",(uintptr_t)(PAGE_SIZE * inuse_ptepages_count)); +#endif + kdb_printf("Kalloc.Large:%lu\n",(uintptr_t)kalloc_large_total); + } +} + #if !MACH_KDP static struct ether_addr kdp_current_mac_address = {{0, 0, 0, 0, 0, 0}}; unsigned int not_in_kdp = 1; diff --git a/osfmk/kern/debug.h b/osfmk/kern/debug.h index e861592e6..cdf94989f 100644 --- a/osfmk/kern/debug.h +++ b/osfmk/kern/debug.h @@ -84,6 +84,7 @@ int packA(char *inbuf, uint32_t length, uint32_t buflen); void unpackA(char *inbuf, uint32_t length); void panic_display_system_configuration(void); +void panic_display_zprint(void); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/hibernate.c b/osfmk/kern/hibernate.c index 3decaefe2..27a089239 100644 --- a/osfmk/kern/hibernate.c +++ b/osfmk/kern/hibernate.c @@ -189,11 +189,13 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, uint64_t start, end, nsec; vm_page_t m; uint32_t pages = page_list->page_count; - uint32_t count_zf = 0, count_throttled = 0, count_inactive = 0, count_active = 0; + uint32_t count_zf = 0, count_throttled = 0; + uint32_t count_inactive = 0, count_active = 0, count_speculative = 0; uint32_t count_wire = pages; uint32_t count_discard_active = 0; uint32_t count_discard_inactive = 0; uint32_t count_discard_purgeable = 0; + uint32_t count_discard_speculative = 0; uint32_t i; uint32_t bank; hibernate_bitmap_t * bitmap; @@ -262,7 +264,7 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, queue_iterate( &vm_page_queue_zf, m, vm_page_t, - pageq ) + pageq ) { if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) && consider_discard(m)) @@ -299,6 +301,26 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); } + for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) + { + queue_iterate(&vm_page_queue_speculative[i].age_q, + m, + vm_page_t, + pageq) + { + if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) + && consider_discard(m)) + { + hibernate_page_bitset(page_list, TRUE, m->phys_page); + count_discard_speculative++; + } + else + count_speculative++; + count_wire--; + hibernate_page_bitset(page_list_wired, TRUE, m->phys_page); + } + } + queue_iterate( &vm_page_queue_active, m, vm_page_t, @@ -338,11 +360,11 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list, absolutetime_to_nanoseconds(end - start, 
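panic_display_zprint() above walks the zone list at panic time by copying each node with ml_nofault_copy() before touching it, so a corrupt next pointer ends the walk instead of faulting inside the panic path. A sketch of that walk-by-value pattern; nofault_copy() is a hypothetical stand-in and the struct is reduced to the fields the patch reads.

#include <stddef.h>
#include <stdio.h>

struct zone {
	size_t       cur_size;
	const char  *zone_name;
	struct zone *next_zone;
};

/* Hypothetical stand-in for ml_nofault_copy(): copies size bytes and
 * returns the number copied, or 0 if the source would fault. */
size_t nofault_copy(const void *src, void *dst, size_t size);

static void
dump_large_zones(struct zone *first, unsigned nzones)
{
	struct zone z;
	const void *cur = first;

	for (unsigned i = 0; i < nzones && cur != NULL; i++) {
		if (nofault_copy(cur, &z, sizeof(z)) != sizeof(z))
			break;                       /* unreadable node: stop the walk */
		if (z.cur_size > (1024 * 1024))
			/* the name pointer in the copy still refers to the original
			 * string, exactly as in the patch's kdb_printf */
			printf("%.20s:%lu\n", z.zone_name, (unsigned long)z.cur_size);
		cur = z.next_zone;
	}
}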
&nsec); HIBLOG("hibernate_page_list_setall time: %qd ms\n", nsec / 1000000ULL); - HIBLOG("pages %d, wire %d, act %d, inact %d, zf %d, throt %d, could discard act %d inact %d purgeable %d\n", - pages, count_wire, count_active, count_inactive, count_zf, count_throttled, - count_discard_active, count_discard_inactive, count_discard_purgeable); + HIBLOG("pages %d, wire %d, act %d, inact %d, spec %d, zf %d, throt %d, could discard act %d inact %d purgeable %d spec %d\n", + pages, count_wire, count_active, count_inactive, count_speculative, count_zf, count_throttled, + count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative); - *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable; + *pagesOut = pages - count_discard_active - count_discard_inactive - count_discard_purgeable - count_discard_speculative; } void @@ -351,9 +373,11 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) uint64_t start, end, nsec; vm_page_t m; vm_page_t next; + uint32_t i; uint32_t count_discard_active = 0; uint32_t count_discard_inactive = 0; uint32_t count_discard_purgeable = 0; + uint32_t count_discard_speculative = 0; clock_get_uptime(&start); @@ -372,6 +396,21 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) m = next; } + for( i = 0; i <= VM_PAGE_MAX_SPECULATIVE_AGE_Q; i++ ) + { + m = (vm_page_t) queue_first(&vm_page_queue_speculative[i].age_q); + while (m && !queue_end(&vm_page_queue_speculative[i].age_q, (queue_entry_t)m)) + { + next = (vm_page_t) m->pageq.next; + if (hibernate_page_bittst(page_list, m->phys_page)) + { + count_discard_speculative++; + discard_page(m); + } + m = next; + } + } + m = (vm_page_t) queue_first(&vm_page_queue_inactive); while (m && !queue_end(&vm_page_queue_inactive, (queue_entry_t)m)) { @@ -404,9 +443,9 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list) clock_get_uptime(&end); absolutetime_to_nanoseconds(end - start, &nsec); - HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d\n", + HIBLOG("hibernate_page_list_discard time: %qd ms, discarded act %d inact %d purgeable %d spec %d\n", nsec / 1000000ULL, - count_discard_active, count_discard_inactive, count_discard_purgeable); + count_discard_active, count_discard_inactive, count_discard_purgeable, count_discard_speculative); } /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ diff --git a/osfmk/kern/host.c b/osfmk/kern/host.c index 010ac2887..0f8de2841 100644 --- a/osfmk/kern/host.c +++ b/osfmk/kern/host.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -153,7 +153,7 @@ host_info( case HOST_BASIC_INFO: { register host_basic_info_t basic_info; - register int master_slot; + register int master_num; /* * Basic information about this host. 
@@ -166,12 +166,12 @@ host_info( basic_info->memory_size = machine_info.memory_size; basic_info->max_cpus = machine_info.max_cpus; basic_info->avail_cpus = processor_avail_count; - master_slot = PROCESSOR_DATA(master_processor, slot_num); - basic_info->cpu_type = slot_type(master_slot); - basic_info->cpu_subtype = slot_subtype(master_slot); + master_num = master_processor->cpu_num; + basic_info->cpu_type = slot_type(master_num); + basic_info->cpu_subtype = slot_subtype(master_num); if (*count >= HOST_BASIC_INFO_COUNT) { - basic_info->cpu_threadtype = slot_threadtype(master_slot); + basic_info->cpu_threadtype = slot_threadtype(master_num); basic_info->physical_cpu = machine_info.physical_cpu; basic_info->physical_cpu_max = machine_info.physical_cpu_max; basic_info->logical_cpu = machine_info.logical_cpu; diff --git a/osfmk/kern/ipc_mig.c b/osfmk/kern/ipc_mig.c index 3b2ae194b..0dbe02f08 100644 --- a/osfmk/kern/ipc_mig.c +++ b/osfmk/kern/ipc_mig.c @@ -93,8 +93,9 @@ * Nothing locked. * Returns: * MACH_MSG_SUCCESS Sent the message. - * MACH_MSG_SEND_NO_BUFFER Destination port had inuse fixed bufer * MACH_SEND_INVALID_DEST Bad destination port. + * MACH_MSG_SEND_NO_BUFFER Destination port had inuse fixed bufer + * or destination is above kernel limit */ mach_msg_return_t @@ -113,9 +114,13 @@ mach_msg_send_from_kernel( return mr; ipc_kmsg_copyin_from_kernel(kmsg); - ipc_kmsg_send_always(kmsg); - return MACH_MSG_SUCCESS; + mr = ipc_kmsg_send_always(kmsg); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_destroy(kmsg); + } + + return mr; } mach_msg_return_t @@ -138,7 +143,7 @@ mach_msg_send_from_kernel_with_options( ipc_kmsg_copyin_from_kernel(kmsg); mr = ipc_kmsg_send(kmsg, option, timeout_val); if (mr != MACH_MSG_SUCCESS) { - ipc_kmsg_free(kmsg); + ipc_kmsg_destroy(kmsg); } return mr; @@ -196,7 +201,11 @@ mach_msg_rpc_from_kernel( ipc_kmsg_copyin_from_kernel(kmsg); - ipc_kmsg_send_always(kmsg); + mr = ipc_kmsg_send_always(kmsg); + if (mr != MACH_MSG_SUCCESS) { + ipc_kmsg_destroy(kmsg); + return mr; + } for (;;) { ipc_mqueue_t mqueue; diff --git a/osfmk/kern/kmod.c b/osfmk/kern/kmod.c index f30d897e2..1feb3688e 100644 --- a/osfmk/kern/kmod.c +++ b/osfmk/kern/kmod.c @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ -55,6 +56,8 @@ #include #include +#include + /* * XXX headers for which prototypes should be in a common include file; * XXX see libsa/kext.cpp for why. @@ -99,6 +102,377 @@ typedef struct cmd_queue_entry { queue_head_t kmod_cmd_queue; +/******************************************************************************* +*******************************************************************************/ +#define KMOD_PANICLIST_SIZE (2 * PAGE_SIZE) + +char * unloaded_kext_paniclist = NULL; +uint32_t unloaded_kext_paniclist_size = 0; +uint32_t unloaded_kext_paniclist_length = 0; +uint64_t last_loaded_timestamp = 0; + +char * loaded_kext_paniclist = NULL; +uint32_t loaded_kext_paniclist_size = 0; +uint32_t loaded_kext_paniclist_length = 0; +uint64_t last_unloaded_timestamp = 0; + +int substitute( + const char * scan_string, + char * string_out, + uint32_t * to_index, + uint32_t * from_index, + const char * substring, + char marker, + char substitution); + +/* identifier_out must be at least KMOD_MAX_NAME bytes. 
+ */ +int substitute( + const char * scan_string, + char * string_out, + uint32_t * to_index, + uint32_t * from_index, + const char * substring, + char marker, + char substitution) +{ + uint32_t substring_length = strnlen(substring, KMOD_MAX_NAME - 1); + + if (!strncmp(scan_string, substring, substring_length)) { + if (marker) { + string_out[(*to_index)++] = marker; + } + string_out[(*to_index)++] = substitution; + (*from_index) += substring_length; + return 1; + } + return 0; +} + +void compactIdentifier( + const char * identifier, + char * identifier_out, + char ** identifier_out_end); + +void compactIdentifier( + const char * identifier, + char * identifier_out, + char ** identifier_out_end) +{ + uint32_t from_index, to_index; + uint32_t scan_from_index = 0; + uint32_t scan_to_index = 0; + subs_entry_t * subs_entry = NULL; + int did_sub = 0; + + from_index = to_index = 0; + identifier_out[0] = '\0'; + + /* Replace certain identifier prefixes with shorter @+character sequences. + */ + for (subs_entry = &kext_identifier_prefix_subs[0]; + subs_entry->substring && !did_sub; + subs_entry++) { + + did_sub = substitute(identifier, identifier_out, + &scan_to_index, &scan_from_index, + subs_entry->substring, /* marker */ '\0', subs_entry->substitute); + } + did_sub = 0; + + /* Now scan through the identifier looking for the common substrings + * and replacing them with shorter !+character sequences. + */ + for (/* see above */; + scan_from_index < KMOD_MAX_NAME - 1 && identifier[scan_from_index]; + /* see loop */) { + + const char * scan_string = &identifier[scan_from_index]; + + did_sub = 0; + + if (scan_from_index) { + for (subs_entry = &kext_identifier_substring_subs[0]; + subs_entry->substring && !did_sub; + subs_entry++) { + + did_sub = substitute(scan_string, identifier_out, + &scan_to_index, &scan_from_index, + subs_entry->substring, '!', subs_entry->substitute); + } + } + + if (!did_sub) { + identifier_out[scan_to_index++] = identifier[scan_from_index++]; + } + } + + identifier_out[scan_to_index] = '\0'; + if (identifier_out_end) { + *identifier_out_end = &identifier_out[scan_to_index]; + } + + return; +} + +/* identPlusVers must be at least 2*KMOD_MAX_NAME in length. 
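compactIdentifier() above shrinks bundle identifiers for the panic log by substituting a known prefix (no marker character) and interior substrings (marked with '!') with single characters; the actual substitution tables live in the new mach/kext_panic_report.h, which this excerpt does not show. The sketch below covers only the prefix half, with a made-up table, purely to illustrate the encoding; the real mappings and escape characters may differ.

#include <stdio.h>
#include <string.h>

struct subs_entry {
	const char *substring;
	char        substitute;
};

/* Illustrative table only; the real one is in mach/kext_panic_report.h. */
static const struct subs_entry prefix_subs[] = {
	{ "com.apple.driver.", 'd' },
	{ "com.apple.iokit.",  'i' },
	{ "com.apple.",        'a' },
	{ NULL,                0   },
};

/* Write a compacted copy of identifier into out (size out_size). A matching
 * prefix is replaced by its single substitute character, as substitute()
 * does when called with a NUL marker; the remainder is copied verbatim. */
static void
compact_identifier(const char *identifier, char *out, size_t out_size)
{
	const struct subs_entry *s;

	for (s = prefix_subs; s->substring != NULL; s++) {
		size_t len = strlen(s->substring);
		if (strncmp(identifier, s->substring, len) == 0) {
			snprintf(out, out_size, "%c%s", s->substitute, identifier + len);
			return;
		}
	}
	snprintf(out, out_size, "%s", identifier);
}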
+ */ +int assemble_identifier_and_version( + kmod_info_t * kmod_info, + char * identPlusVers); +int assemble_identifier_and_version( + kmod_info_t * kmod_info, + char * identPlusVers) +{ + int result = 0; + + compactIdentifier(kmod_info->name, identPlusVers, NULL); + result = strnlen(identPlusVers, KMOD_MAX_NAME - 1); + identPlusVers[result++] = '\t'; // increment for real char + identPlusVers[result] = '\0'; // don't increment for nul char + result = strlcat(identPlusVers, kmod_info->version, KMOD_MAX_NAME); + + return result; +} + +#define LAST_LOADED " - last loaded " +#define LAST_LOADED_TS_WIDTH (16) + +uint32_t save_loaded_kext_paniclist_typed( + const char * prefix, + int invertFlag, + int libsFlag, + char * paniclist, + uint32_t list_size, + uint32_t * list_length_ptr, + int (*printf_func)(const char *fmt, ...)); +uint32_t save_loaded_kext_paniclist_typed( + const char * prefix, + int invertFlag, + int libsFlag, + char * paniclist, + uint32_t list_size, + uint32_t * list_length_ptr, + int (*printf_func)(const char *fmt, ...)) +{ + uint32_t result = 0; + int error = 0; + kmod_info_t * kmod_info; + + for (kmod_info = kmod; + kmod_info && (*list_length_ptr + 1 < list_size); + kmod_info = kmod_info->next) { + + int match; + char identPlusVers[2*KMOD_MAX_NAME]; + uint32_t identPlusVersLength; + char timestampBuffer[17]; // enough for a uint64_t + + if (!pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)kmod_info))) { + (*printf_func)("kmod scan stopped due to missing kmod page: %p\n", + kmod_info); + error = 1; + goto finish; + } + + /* Skip all built-in/fake entries. + */ + if (!kmod_info->address) { + continue; + } + + /* Filter for kmod name (bundle identifier). + */ + match = !strncmp(kmod_info->name, prefix, strnlen(prefix, KMOD_MAX_NAME)); + if ((match && invertFlag) || (!match && !invertFlag)) { + continue; + } + + /* Filter for libraries. This isn't a strictly correct check, + * but any kext that does have references to it has to be a library. + * A kext w/o references may or may not be a library. + */ + if ((libsFlag == 0 && kmod_info->reference_count) || + (libsFlag == 1 && !kmod_info->reference_count)) { + + continue; + } + + identPlusVersLength = assemble_identifier_and_version(kmod_info, + identPlusVers); + if (!identPlusVersLength) { + printf_func("error saving loaded kext info\n"); + goto finish; + } + + /* We're going to note the last-loaded kext in the list. + */ + if (kmod_info == kmod) { + snprintf(timestampBuffer, sizeof(timestampBuffer), "%llu", + last_loaded_timestamp); + identPlusVersLength += sizeof(LAST_LOADED) - 1 + + strnlen(timestampBuffer, sizeof(timestampBuffer)); + } + + /* Adding 1 for the newline. 
+ */ + if (*list_length_ptr + identPlusVersLength + 1 >= list_size) { + goto finish; + } + + *list_length_ptr = strlcat(paniclist, identPlusVers, list_size); + if (kmod_info == kmod) { + *list_length_ptr = strlcat(paniclist, LAST_LOADED, list_size); + *list_length_ptr = strlcat(paniclist, timestampBuffer, list_size); + } + *list_length_ptr = strlcat(paniclist, "\n", list_size); + } + +finish: + if (!error) { + if (*list_length_ptr + 1 <= list_size) { + result = list_size - (*list_length_ptr + 1); + } + } + + return result; +} + +void save_loaded_kext_paniclist( + int (*printf_func)(const char *fmt, ...)); + +void save_loaded_kext_paniclist( + int (*printf_func)(const char *fmt, ...)) +{ + char * newlist = NULL; + uint32_t newlist_size = 0; + uint32_t newlist_length = 0; + + newlist_length = 0; + newlist_size = KMOD_PANICLIST_SIZE; + newlist = (char *)kalloc(newlist_size); + + if (!newlist) { + printf_func("couldn't allocate kext panic log buffer\n"); + goto finish; + } + + newlist[0] = '\0'; + + // non-"com.apple." kexts + if (!save_loaded_kext_paniclist_typed("com.apple.", /* invert? */ 1, + /* libs? */ -1, newlist, newlist_size, &newlist_length, + printf_func)) { + + goto finish; + } + // "com.apple." nonlibrary kexts + if (!save_loaded_kext_paniclist_typed("com.apple.", /* invert? */ 0, + /* libs? */ 0, newlist, newlist_size, &newlist_length, + printf_func)) { + + goto finish; + } + // "com.apple." library kexts + if (!save_loaded_kext_paniclist_typed("com.apple.", /* invert? */ 0, + /* libs? */ 1, newlist, newlist_size, &newlist_length, + printf_func)) { + + goto finish; + } + + if (loaded_kext_paniclist) { + kfree(loaded_kext_paniclist, loaded_kext_paniclist_size); + } + loaded_kext_paniclist = newlist; + loaded_kext_paniclist_size = newlist_size; + loaded_kext_paniclist_length = newlist_length; + +finish: + return; +} + +void save_unloaded_kext_paniclist( + kmod_info_t * kmod_info, + int (*printf_func)(const char *fmt, ...)); +void save_unloaded_kext_paniclist( + kmod_info_t * kmod_info, + int (*printf_func)(const char *fmt, ...)) +{ + char * newlist = NULL; + uint32_t newlist_size = 0; + uint32_t newlist_length = 0; + char identPlusVers[2*KMOD_MAX_NAME]; + uint32_t identPlusVersLength; + + identPlusVersLength = assemble_identifier_and_version(kmod_info, + identPlusVers); + if (!identPlusVersLength) { + printf_func("error saving unloaded kext info\n"); + goto finish; + } + + newlist_length = identPlusVersLength; + newlist_size = newlist_length + 1; + newlist = (char *)kalloc(newlist_size); + + if (!newlist) { + printf_func("couldn't allocate kext panic log buffer\n"); + goto finish; + } + + newlist[0] = '\0'; + + strlcpy(newlist, identPlusVers, newlist_size); + + if (unloaded_kext_paniclist) { + kfree(unloaded_kext_paniclist, unloaded_kext_paniclist_size); + } + unloaded_kext_paniclist = newlist; + unloaded_kext_paniclist_size = newlist_size; + unloaded_kext_paniclist_length = newlist_length; + +finish: + return; +} + +// proto is in header +void record_kext_unload(kmod_t kmod_id) +{ + kmod_info_t * kmod_info = NULL; + + mutex_lock(kmod_lock); + + kmod_info = kmod_lookupbyid(kmod_id); + if (kmod_info) { + clock_get_uptime(&last_unloaded_timestamp); + save_unloaded_kext_paniclist(kmod_info, &printf); + } + mutex_unlock(kmod_lock); + return; +} + +void dump_kext_info(int (*printf_func)(const char *fmt, ...)) +{ + printf_func("unloaded kexts:\n"); + if (unloaded_kext_paniclist && (pmap_find_phys(kernel_pmap, (addr64_t) (uintptr_t) unloaded_kext_paniclist))) { + printf_func("%.*s - last 
unloaded %llu\n", + unloaded_kext_paniclist_length, unloaded_kext_paniclist, + last_unloaded_timestamp); + } else { + printf_func("(none)\n"); + } + printf_func("loaded kexts:\n"); + if (loaded_kext_paniclist && (pmap_find_phys(kernel_pmap, (addr64_t) (uintptr_t) loaded_kext_paniclist)) && loaded_kext_paniclist[0]) { + printf_func("%.*s", loaded_kext_paniclist_length, loaded_kext_paniclist); + } else { + printf_func("(none)\n"); + } + return; +} + +/******************************************************************************* +*******************************************************************************/ void kmod_init(void) { @@ -141,27 +515,27 @@ kmod_lookupbyname(const char * name) int kmod_lookupidbyaddress_locked(vm_address_t addr) { kmod_info_t *k = 0; - + mutex_lock(kmod_queue_lock); k = kmod; - if(NULL != k) { - while (k) { - if ((k->address <= addr) && ((k->address + k->size) > addr)) { - break; - } - k = k->next; - } - mutex_unlock(kmod_queue_lock); - } else { - mutex_unlock(kmod_queue_lock); - return -1; - } - - if(NULL == k) { - return -1; - } else { - return k->id; - } + if(NULL != k) { + while (k) { + if ((k->address <= addr) && ((k->address + k->size) > addr)) { + break; + } + k = k->next; + } + mutex_unlock(kmod_queue_lock); + } else { + mutex_unlock(kmod_queue_lock); + return -1; + } + + if(NULL == k) { + return -1; + } else { + return k->id; + } } kmod_info_t * @@ -387,6 +761,9 @@ kmod_create_internal(kmod_info_t *info, kmod_t *id) *id = info->id; + clock_get_uptime(&last_loaded_timestamp); + save_loaded_kext_paniclist(&printf); + mutex_unlock(kmod_lock); #if DEBUG @@ -542,6 +919,10 @@ _kmod_destroy_internal(kmod_t id, boolean_t fake) k = k->next; } + if (!fake) { + save_loaded_kext_paniclist(&printf); + } + mutex_unlock(kmod_lock); return KERN_INVALID_ARGUMENT; diff --git a/osfmk/kern/machine.c b/osfmk/kern/machine.c index 0fa0930c7..898dd3bfa 100644 --- a/osfmk/kern/machine.c +++ b/osfmk/kern/machine.c @@ -121,7 +121,7 @@ processor_up( pset = processor->processor_set; pset_lock(pset); if (++pset->processor_count == 1) - pset->low_pri = processor; + pset->low_pri = pset->low_count = processor; enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->state = PROCESSOR_RUNNING; (void)hw_atomic_add(&processor_avail_count, 1); @@ -213,15 +213,11 @@ processor_shutdown( return (KERN_SUCCESS); } - if (processor->state == PROCESSOR_IDLE) { + if (processor->state == PROCESSOR_IDLE) remqueue(&pset->idle_queue, (queue_entry_t)processor); - pset->idle_count--; - } else if (processor->state == PROCESSOR_RUNNING) remqueue(&pset->active_queue, (queue_entry_t)processor); - else - panic("processor_shutdown"); processor->state = PROCESSOR_SHUTDOWN; @@ -230,7 +226,7 @@ processor_shutdown( processor_doshutdown(processor); splx(s); - cpu_exit_wait(PROCESSOR_DATA(processor, slot_num)); + cpu_exit_wait(processor->cpu_num); return (KERN_SUCCESS); } @@ -270,24 +266,6 @@ processor_doshutdown( old_thread = machine_processor_shutdown(self, processor_offline, processor); thread_dispatch(old_thread, self); - - /* - * If we just shutdown another processor, move any - * threads and timer call outs to the current processor. 
- */ - if (processor != current_processor()) { - processor_set_t pset = processor->processor_set; - - pset_lock(pset); - - if (processor->state == PROCESSOR_OFF_LINE || processor->state == PROCESSOR_SHUTDOWN) { - timer_call_shutdown(processor); - processor_queue_shutdown(processor); - return; - } - - pset_unlock(pset); - } } /* @@ -315,15 +293,17 @@ processor_offline( thread_dispatch(old_thread, new_thread); - PMAP_DEACTIVATE_KERNEL(PROCESSOR_DATA(processor, slot_num)); + PMAP_DEACTIVATE_KERNEL(processor->cpu_num); pset = processor->processor_set; pset_lock(pset); processor->state = PROCESSOR_OFF_LINE; if (--pset->processor_count == 0) - pset->low_pri = PROCESSOR_NULL; + pset->low_pri = pset->low_count = PROCESSOR_NULL; (void)hw_atomic_sub(&processor_avail_count, 1); - pset_unlock(pset); + processor_queue_shutdown(processor); + /* pset lock dropped */ + ml_cpu_down(); cpu_sleep(); diff --git a/osfmk/kern/misc_protos.h b/osfmk/kern/misc_protos.h index 6d2ceb898..3590d3c45 100644 --- a/osfmk/kern/misc_protos.h +++ b/osfmk/kern/misc_protos.h @@ -122,6 +122,8 @@ extern void dbugprintf(const char *format, ...) __printflike(1,2); extern int kdb_printf(const char *format, ...) __printflike(1,2); +extern int kdb_log(const char *format, ...) __printflike(1,2); + extern void printf_init(void); extern int snprintf(char *, size_t, const char *, ...) __printflike(3,4); @@ -152,6 +154,8 @@ extern void conslog_putc(char); extern void consdebug_putc(char); +extern void consdebug_log(char); + extern void cnputc(char); extern int cngetc(void); diff --git a/osfmk/kern/printf.c b/osfmk/kern/printf.c index 1f7015c87..f8376b419 100644 --- a/osfmk/kern/printf.c +++ b/osfmk/kern/printf.c @@ -796,6 +796,13 @@ consdebug_putc(char c) PE_kputc(c); } + +void +consdebug_log(char c) +{ + debug_putc(c); +} + int kdb_printf(const char *fmt, ...) { @@ -807,6 +814,17 @@ kdb_printf(const char *fmt, ...) return 0; } +int +kdb_log(const char *fmt, ...) +{ + va_list listp; + + va_start(listp, fmt); + _doprnt(fmt, &listp, consdebug_log, 16); + va_end(listp); + return 0; +} + static void copybyte(int c, void *arg) { diff --git a/osfmk/kern/priority.c b/osfmk/kern/priority.c index 8ee162fa3..bc0e89a5c 100644 --- a/osfmk/kern/priority.c +++ b/osfmk/kern/priority.c @@ -165,7 +165,7 @@ thread_quantum_expire( /* * Context switch check. */ - if ((preempt = csw_check(thread, processor)) != AST_NONE) + if ((preempt = csw_check(processor)) != AST_NONE) ast_on(preempt); else { processor_set_t pset = processor->processor_set; @@ -173,6 +173,7 @@ thread_quantum_expire( pset_lock(pset); pset_pri_hint(pset, processor, processor->current_pri); + pset_count_hint(pset, processor, processor->runq.count); pset_unlock(pset); } diff --git a/osfmk/kern/processor.c b/osfmk/kern/processor.c index 0a413bcf1..9436505b3 100644 --- a/osfmk/kern/processor.c +++ b/osfmk/kern/processor.c @@ -132,36 +132,36 @@ processor_bootstrap(void) /* * Initialize the given processor for the cpu - * indicated by slot_num, and assign to the + * indicated by cpu_num, and assign to the * specified processor set. 
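The new kdb_log()/consdebug_log() pair above formats through the same _doprnt engine as kdb_printf() but feeds only the in-memory debug buffer, not the console. A userspace-flavoured sketch of that "same formatter, different character sink" split; the buffer size and vsnprintf staging are illustrative, since the kernel emits characters directly from _doprnt.

#include <stdarg.h>
#include <stdio.h>

static char   logbuf[4096];
static size_t loglen;

/* Sink that only appends to the in-memory log (consdebug_log analogue). */
static void sink_log(char c)
{
	if (loglen < sizeof(logbuf) - 1)
		logbuf[loglen++] = c;
}

/* Sink that also echoes to the console (consdebug_putc analogue). */
static void sink_console(char c)
{
	putchar(c);    /* stands in for cnputc()/PE_kputc() */
	sink_log(c);
}

static int vsinkprintf(void (*sink)(char), const char *fmt, va_list ap)
{
	char tmp[256];
	int n = vsnprintf(tmp, sizeof(tmp), fmt, ap);

	for (int i = 0; i < n && tmp[i] != '\0'; i++)
		sink(tmp[i]);
	return n;
}

static int kdb_log_sketch(const char *fmt, ...)
{
	va_list ap;
	int n;

	va_start(ap, fmt);
	n = vsinkprintf(sink_log, fmt, ap);
	va_end(ap);
	return n;
}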
*/ void processor_init( - processor_t p, - int slot_num, - processor_set_t pset) + processor_t processor, + int cpu_num, + processor_set_t pset) { - run_queue_init(&p->runq); - - p->state = PROCESSOR_OFF_LINE; - p->active_thread = p->next_thread = p->idle_thread = THREAD_NULL; - p->processor_set = pset; - p->current_pri = MINPRI; - timer_call_setup(&p->quantum_timer, thread_quantum_expire, p); - p->deadline = UINT64_MAX; - p->timeslice = 0; - p->processor_self = IP_NULL; - simple_lock_init(&p->lock, 0); - processor_data_init(p); - PROCESSOR_DATA(p, slot_num) = slot_num; - p->processor_list = NULL; + run_queue_init(&processor->runq); + + processor->state = PROCESSOR_OFF_LINE; + processor->active_thread = processor->next_thread = processor->idle_thread = THREAD_NULL; + processor->processor_set = pset; + processor->current_pri = MINPRI; + processor->cpu_num = cpu_num; + timer_call_setup(&processor->quantum_timer, thread_quantum_expire, processor); + processor->deadline = UINT64_MAX; + processor->timeslice = 0; + processor->processor_self = IP_NULL; + simple_lock_init(&processor->lock, 0); + processor_data_init(processor); + processor->processor_list = NULL; simple_lock(&processor_list_lock); if (processor_list == NULL) - processor_list = p; + processor_list = processor; else - processor_list_tail->processor_list = p; - processor_list_tail = p; + processor_list_tail->processor_list = processor; + processor_list_tail = processor; processor_count++; simple_unlock(&processor_list_lock); } @@ -212,9 +212,8 @@ pset_init( { queue_init(&pset->active_queue); queue_init(&pset->idle_queue); - pset->idle_count = 0; pset->processor_count = 0; - pset->low_pri = PROCESSOR_NULL; + pset->low_pri = pset->low_count = PROCESSOR_NULL; pset_lock_init(pset); pset->pset_self = IP_NULL; pset->pset_name_self = IP_NULL; @@ -253,13 +252,13 @@ processor_info( processor_info_t info, mach_msg_type_number_t *count) { - register int slot_num, state; + register int cpu_num, state; kern_return_t result; if (processor == PROCESSOR_NULL) return (KERN_INVALID_ARGUMENT); - slot_num = PROCESSOR_DATA(processor, slot_num); + cpu_num = processor->cpu_num; switch (flavor) { @@ -271,14 +270,14 @@ processor_info( return (KERN_FAILURE); basic_info = (processor_basic_info_t) info; - basic_info->cpu_type = slot_type(slot_num); - basic_info->cpu_subtype = slot_subtype(slot_num); + basic_info->cpu_type = slot_type(cpu_num); + basic_info->cpu_subtype = slot_subtype(cpu_num); state = processor->state; if (state == PROCESSOR_OFF_LINE) basic_info->running = FALSE; else basic_info->running = TRUE; - basic_info->slot_num = slot_num; + basic_info->slot_num = cpu_num; if (processor == master_processor) basic_info->is_master = TRUE; else @@ -313,7 +312,7 @@ processor_info( } default: - result = cpu_info(flavor, slot_num, info, count); + result = cpu_info(flavor, cpu_num, info, count); if (result == KERN_SUCCESS) *host = &realhost; @@ -339,7 +338,7 @@ processor_start( prev = thread_bind(processor); thread_block(THREAD_CONTINUE_NULL); - result = cpu_start(PROCESSOR_DATA(processor, slot_num)); + result = cpu_start(processor->cpu_num); thread_bind(prev); @@ -408,12 +407,11 @@ processor_start( if (processor->processor_self == IP_NULL) ipc_processor_init(processor); - result = cpu_start(PROCESSOR_DATA(processor, slot_num)); + result = cpu_start(processor->cpu_num); if (result != KERN_SUCCESS) { s = splsched(); pset_lock(pset); processor->state = PROCESSOR_OFF_LINE; - timer_call_shutdown(processor); pset_unlock(pset); splx(s); @@ -444,7 +442,7 @@ 
processor_control( if (processor == PROCESSOR_NULL) return(KERN_INVALID_ARGUMENT); - return(cpu_control(PROCESSOR_DATA(processor, slot_num), info, count)); + return(cpu_control(processor->cpu_num, info, count)); } kern_return_t diff --git a/osfmk/kern/processor.h b/osfmk/kern/processor.h index 5cb479cd1..24603cc11 100644 --- a/osfmk/kern/processor.h +++ b/osfmk/kern/processor.h @@ -84,9 +84,8 @@ struct processor_set { queue_head_t active_queue; /* active processors */ queue_head_t idle_queue; /* idle processors */ - int idle_count; - processor_t low_pri; + processor_t low_pri, low_count; int processor_count; @@ -128,6 +127,7 @@ struct processor { processor_set_t processor_set; /* assigned set */ int current_pri; /* priority of current thread */ + int cpu_num; /* platform numeric id */ timer_call_data_t quantum_timer; /* timer for quantum expiration */ uint64_t quantum_end; /* time when current quantum ends */ @@ -149,7 +149,9 @@ extern processor_t processor_list; extern unsigned int processor_count; decl_simple_lock_data(extern,processor_list_lock) -extern processor_t master_processor; +extern uint32_t processor_avail_count; + +extern processor_t master_processor; /* * Processor state is accessed by locking the scheduling lock @@ -158,9 +160,10 @@ extern processor_t master_processor; #define PROCESSOR_OFF_LINE 0 /* Not available */ #define PROCESSOR_SHUTDOWN 1 /* Going off-line */ #define PROCESSOR_START 2 /* Being started */ -#define PROCESSOR_IDLE 3 /* Idle */ -#define PROCESSOR_DISPATCHING 4 /* Dispatching (idle -> running) */ -#define PROCESSOR_RUNNING 5 /* Normal execution */ +#define PROCESSOR_INACTIVE 3 /* Inactive (unavailable) */ +#define PROCESSOR_IDLE 4 /* Idle (available) */ +#define PROCESSOR_DISPATCHING 5 /* Dispatching (idle -> active) */ +#define PROCESSOR_RUNNING 6 /* Normal execution */ extern processor_t current_processor(void); @@ -184,6 +187,20 @@ MACRO_BEGIN \ if ((p) != (ps)->low_pri) { \ if ((pri) < (ps)->low_pri->current_pri) \ (ps)->low_pri = (p); \ + else \ + if ((ps)->low_pri->state < PROCESSOR_IDLE) \ + (ps)->low_pri = (p); \ + } \ +MACRO_END + +#define pset_count_hint(ps, p, cnt) \ +MACRO_BEGIN \ + if ((p) != (ps)->low_count) { \ + if ((cnt) < (ps)->low_count->runq.count) \ + (ps)->low_count = (p); \ + else \ + if ((ps)->low_count->state < PROCESSOR_IDLE) \ + (ps)->low_count = (p); \ } \ MACRO_END @@ -191,7 +208,7 @@ extern void processor_bootstrap(void) __attribute__((section("__TEXT, initcode" extern void processor_init( processor_t processor, - int slot_num, + int cpu_num, processor_set_t processor_set) __attribute__((section("__TEXT, initcode"))); extern kern_return_t processor_shutdown( @@ -219,6 +236,12 @@ extern kern_return_t processor_info_count( #define pset_deallocate(x) #define pset_reference(x) +extern void machine_run_count( + uint32_t count); + +extern boolean_t machine_cpu_is_inactive( + int num); + #else /* MACH_KERNEL_PRIVATE */ __BEGIN_DECLS @@ -233,9 +256,4 @@ __END_DECLS #endif /* MACH_KERNEL_PRIVATE */ -#ifdef XNU_KERNEL_PRIVATE - -extern uint32_t processor_avail_count; - -#endif #endif /* _KERN_PROCESSOR_H_ */ diff --git a/osfmk/kern/processor_data.c b/osfmk/kern/processor_data.c index 41031a8fe..9f81a2d18 100644 --- a/osfmk/kern/processor_data.c +++ b/osfmk/kern/processor_data.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. + * Copyright (c) 2003-2008 Apple Inc. All rights reserved. 
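/*
 * A minimal sketch of the hint maintenance done by the pset_pri_hint() and
 * pset_count_hint() macros above: each processor set remembers a processor
 * running low-priority work and one with a short run queue, and a hint is
 * replaced either by a better candidate or when the remembered processor is
 * no longer schedulable (state below IDLE).  The struct and enum here are
 * simplified stand-ins for struct processor / struct processor_set.
 */
enum cpu_state {            /* same ordering as the PROCESSOR_* constants */
    CPU_OFF_LINE, CPU_SHUTDOWN, CPU_START, CPU_INACTIVE,
    CPU_IDLE, CPU_DISPATCHING, CPU_RUNNING
};

struct cpu {
    enum cpu_state state;
    int            current_pri;   /* priority of the thread it is running */
    int            runq_count;    /* depth of its local run queue         */
};

struct cpu_set {
    struct cpu *low_pri;          /* lowest-priority hint */
    struct cpu *low_count;        /* shortest-queue hint  */
};

static void
cpu_set_hint_update(struct cpu_set *ps, struct cpu *p)
{
    if (p != ps->low_pri &&
        (p->current_pri < ps->low_pri->current_pri || ps->low_pri->state < CPU_IDLE))
        ps->low_pri = p;

    if (p != ps->low_count &&
        (p->runq_count < ps->low_count->runq_count || ps->low_count->state < CPU_IDLE))
        ps->low_count = p;
}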
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -43,6 +43,4 @@ processor_data_init( timer_init(&PROCESSOR_DATA(processor, idle_state)); timer_init(&PROCESSOR_DATA(processor, system_state)); timer_init(&PROCESSOR_DATA(processor, user_state)); - - queue_init(&PROCESSOR_DATA(processor, timer_call_queue)); } diff --git a/osfmk/kern/processor_data.h b/osfmk/kern/processor_data.h index 4debe720d..0e3f64705 100644 --- a/osfmk/kern/processor_data.h +++ b/osfmk/kern/processor_data.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2003-2007 Apple Inc. All rights reserved. + * Copyright (c) 2003-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -59,9 +59,6 @@ struct processor_data { unsigned int count; } stack_cache; - /* Pending timer callouts */ - queue_head_t timer_call_queue; - /* VM event counters */ vm_statistics_data_t vm_stat; @@ -72,8 +69,6 @@ struct processor_data { unsigned int avail; } ikm_cache; - int slot_num; - unsigned long page_grab_count; int start_color; void *free_pages; diff --git a/osfmk/kern/sched.h b/osfmk/kern/sched.h index 2e7018ef9..088e84c3d 100644 --- a/osfmk/kern/sched.h +++ b/osfmk/kern/sched.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -185,15 +185,6 @@ MACRO_BEGIN \ (thread)->realtime.computation: std_quantum; \ MACRO_END -/* Invoked at splsched by a thread on itself */ -#define csw_needed(thread, processor) ( \ - ((thread)->state & TH_SUSP) || \ - (first_timeslice(processor)? \ - ((processor)->runq.highq > (thread)->sched_pri || \ - rt_runq.highq > (thread)->sched_pri) : \ - ((processor)->runq.highq >= (thread)->sched_pri || \ - rt_runq.highq >= (thread)->sched_pri)) ) - extern struct run_queue rt_runq; /* @@ -209,10 +200,8 @@ extern void thread_quantum_expire( timer_call_param_t processor, timer_call_param_t thread); -/* Called at splsched by a thread on itself */ -extern ast_t csw_check( - thread_t thread, - processor_t processor); +/* Context switch check for current processor */ +extern ast_t csw_check(processor_t processor); extern uint32_t std_quantum, min_std_quantum; extern uint32_t std_quantum_us; @@ -258,16 +247,24 @@ extern uint64_t max_unsafe_computation; extern uint64_t max_poll_computation; #define sched_run_incr() \ - (void)hw_atomic_add(&sched_run_count, 1) +MACRO_BEGIN \ + machine_run_count(hw_atomic_add(&sched_run_count, 1)); \ +MACRO_END #define sched_run_decr() \ - (void)hw_atomic_sub(&sched_run_count, 1) +MACRO_BEGIN \ + machine_run_count(hw_atomic_sub(&sched_run_count, 1)); \ +MACRO_END #define sched_share_incr() \ - (void)hw_atomic_add(&sched_share_count, 1) +MACRO_BEGIN \ + (void)hw_atomic_add(&sched_share_count, 1); \ +MACRO_END #define sched_share_decr() \ - (void)hw_atomic_sub(&sched_share_count, 1) +MACRO_BEGIN \ + (void)hw_atomic_sub(&sched_share_count, 1); \ +MACRO_END /* * thread_timer_delta macro takes care of both thread timers. diff --git a/osfmk/kern/sched_prim.c b/osfmk/kern/sched_prim.c index a12449033..4e281607d 100644 --- a/osfmk/kern/sched_prim.c +++ b/osfmk/kern/sched_prim.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2008 Apple Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
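/*
 * A minimal sketch of the reworked sched_run_incr()/sched_run_decr() macros
 * above, which now pass the updated runnable-thread count to
 * machine_run_count() so the platform layer can observe it (for example to
 * decide how many cores need to stay active).  C11 atomics and
 * run_count_hook() stand in for hw_atomic_add()/hw_atomic_sub() and the real
 * machine_run_count().
 */
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint32_t sched_run_count_sketch;

static void
run_count_hook(uint32_t count)
{
    (void)count;        /* platform-specific reaction would go here */
}

static void
sched_run_incr_sketch(void)
{
    run_count_hook(atomic_fetch_add(&sched_run_count_sketch, 1) + 1);
}

static void
sched_run_decr_sketch(void)
{
    run_count_hook(atomic_fetch_sub(&sched_run_count_sketch, 1) - 1);
}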
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -150,6 +150,10 @@ void wait_queues_init(void) __attribute__((section("__TEXT, initcode"))); static void load_shift_init(void) __attribute__((section("__TEXT, initcode"))); static void preempt_pri_init(void) __attribute__((section("__TEXT, initcode"))); +static thread_t run_queue_dequeue( + run_queue_t runq, + integer_t options); + static thread_t thread_select_idle( thread_t thread, processor_t processor); @@ -158,9 +162,6 @@ static thread_t processor_idle( thread_t thread, processor_t processor); -static thread_t choose_thread( - processor_t processor); - static thread_t steal_thread( processor_set_t pset); @@ -1170,7 +1171,7 @@ thread_select( { processor_set_t pset = processor->processor_set; thread_t new_thread = THREAD_NULL; - boolean_t other_runnable; + boolean_t other_runnable, inactive_state; do { /* @@ -1183,6 +1184,8 @@ thread_select( pset_lock(pset); + inactive_state = processor->state != PROCESSOR_SHUTDOWN && machine_cpu_is_inactive(processor->cpu_num); + simple_lock(&rt_lock); /* @@ -1233,7 +1236,8 @@ thread_select( return (thread); } - if ( (!other_runnable || + if (!inactive_state && + (!other_runnable || (processor->runq.highq < thread->sched_pri && rt_runq.highq < thread->sched_pri)) ) { @@ -1243,6 +1247,8 @@ thread_select( pset_pri_hint(pset, processor, processor->current_pri); + pset_count_hint(pset, processor, processor->runq.count); + processor->deadline = UINT64_MAX; pset_unlock(pset); @@ -1251,11 +1257,51 @@ thread_select( } } - if (other_runnable) - return choose_thread(processor); + if (other_runnable) { + if (processor->runq.count > 0 && processor->runq.highq >= rt_runq.highq) { + simple_unlock(&rt_lock); + + thread = run_queue_dequeue(&processor->runq, SCHED_HEADQ); + + if (!inactive_state) { + pset_pri_hint(pset, processor, thread->sched_pri); + + pset_count_hint(pset, processor, processor->runq.count); + } + + processor->deadline = UINT64_MAX; + pset_unlock(pset); + + return (thread); + } + + thread = run_queue_dequeue(&rt_runq, SCHED_HEADQ); + simple_unlock(&rt_lock); + + processor->deadline = thread->realtime.deadline; + pset_unlock(pset); + + return (thread); + } simple_unlock(&rt_lock); + processor->deadline = UINT64_MAX; + + if (inactive_state) { + if (processor->state == PROCESSOR_RUNNING) + remqueue(&pset->active_queue, (queue_entry_t)processor); + else + if (processor->state == PROCESSOR_IDLE) + remqueue(&pset->idle_queue, (queue_entry_t)processor); + + processor->state = PROCESSOR_INACTIVE; + + pset_unlock(pset); + + return (processor->idle_thread); + } + /* * No runnable threads, attempt to steal * from other processors. 
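/*
 * A minimal sketch of the dequeue preference the reworked thread_select()
 * above applies inline: take a thread from the processor-local run queue when
 * it is non-empty and its highest priority is at least that of the global
 * realtime queue, otherwise take the realtime queue.  struct rq / pick_queue()
 * are illustrative stand-ins for run_queue / rt_runq.
 */
struct rq {
    int count;   /* number of runnable threads on this queue */
    int highq;   /* highest priority currently queued        */
};

static struct rq *
pick_queue(struct rq *local, struct rq *rt)
{
    if (local->count > 0 && local->highq >= rt->highq)
        return local;
    return rt;
}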
@@ -1282,12 +1328,9 @@ thread_select( processor->state = PROCESSOR_IDLE; enqueue_head(&pset->idle_queue, (queue_entry_t)processor); - pset->low_pri = processor; - pset->idle_count++; + pset->low_pri = pset->low_count = processor; } - processor->deadline = UINT64_MAX; - pset_unlock(pset); /* @@ -2019,7 +2062,6 @@ realtime_setrun( */ if (processor->state == PROCESSOR_IDLE) { remqueue(&pset->idle_queue, (queue_entry_t)processor); - pset->idle_count--; enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = thread; @@ -2110,7 +2152,6 @@ processor_setrun( */ if (processor->state == PROCESSOR_IDLE) { remqueue(&pset->idle_queue, (queue_entry_t)processor); - pset->idle_count--; enqueue_tail(&pset->active_queue, (queue_entry_t)processor); processor->next_thread = thread; @@ -2129,7 +2170,7 @@ processor_setrun( if (testbit(thread->sched_pri, sched_preempt_pri)) preempt = (AST_PREEMPT | AST_URGENT); else - if (thread->sched_mode & TH_MODE_TIMESHARE && thread->priority < BASEPRI_BACKGROUND) + if (thread->sched_mode & TH_MODE_TIMESHARE && thread->sched_pri < thread->priority) preempt = AST_NONE; else preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE; @@ -2139,9 +2180,7 @@ processor_setrun( if (preempt != AST_NONE) { if (processor == current_processor()) { - thread_t self = processor->active_thread; - - if (csw_needed(self, processor)) + if (csw_check(processor) != AST_NONE) ast_on(preempt); } else @@ -2207,11 +2246,11 @@ choose_processor( * Prefer the last processor, when appropriate. */ if (processor != PROCESSOR_NULL) { - if (processor->processor_set != pset || + if (processor->processor_set != pset || processor->state == PROCESSOR_INACTIVE || processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE) processor = PROCESSOR_NULL; else - if (processor->state == PROCESSOR_IDLE || processor->current_pri < thread->sched_pri) + if (processor->state == PROCESSOR_IDLE || ( thread->sched_pri > BASEPRI_DEFAULT && processor->current_pri < thread->sched_pri)) return (processor); } @@ -2243,12 +2282,20 @@ choose_processor( } else { /* - * Check the low hint processor in the processor set if available. + * Check any hinted processors in the processor set if available. */ - if (cset->low_pri != PROCESSOR_NULL && - cset->low_pri->state != PROCESSOR_SHUTDOWN && cset->low_pri->state != PROCESSOR_OFF_LINE) { - if (processor == PROCESSOR_NULL || cset->low_pri->current_pri < thread->sched_pri) - processor = cset->low_pri; + if (cset->low_pri != PROCESSOR_NULL && cset->low_pri->state != PROCESSOR_INACTIVE && + cset->low_pri->state != PROCESSOR_SHUTDOWN && cset->low_pri->state != PROCESSOR_OFF_LINE && + (processor == PROCESSOR_NULL || + (thread->sched_pri > BASEPRI_DEFAULT && cset->low_pri->current_pri < thread->sched_pri))) { + processor = cset->low_pri; + } + else + if (cset->low_count != PROCESSOR_NULL && cset->low_count->state != PROCESSOR_INACTIVE && + cset->low_count->state != PROCESSOR_SHUTDOWN && cset->low_count->state != PROCESSOR_OFF_LINE && + (processor == PROCESSOR_NULL || + ( thread->sched_pri <= BASEPRI_DEFAULT && cset->low_count->runq.count < processor->runq.count))) { + processor = cset->low_count; } /* @@ -2281,10 +2328,10 @@ choose_processor( do { /* * If we haven't been able to choose a processor, - * pick the current one and return it. + * pick the boot processor and return it. 
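/*
 * A minimal sketch of the hint selection in the reworked choose_processor()
 * above: a thread above the default priority band is steered toward the
 * processor running the lowest-priority work (low_pri), while a
 * default-or-lower thread is steered toward the processor with the shortest
 * run queue (low_count).  struct cand and PRI_DEFAULT are illustrative
 * stand-ins (PRI_DEFAULT is an assumed value for BASEPRI_DEFAULT), and the
 * real routine also screens candidates by state.
 */
#define PRI_DEFAULT 31   /* assumed stand-in for BASEPRI_DEFAULT */

struct cand {
    int current_pri;   /* priority of the thread the candidate is running */
    int runq_count;    /* depth of the candidate's run queue              */
};

static const struct cand *
pick_candidate(const struct cand *low_pri, const struct cand *low_count, int thread_pri)
{
    return (thread_pri > PRI_DEFAULT) ? low_pri : low_count;
}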
*/ if (processor == PROCESSOR_NULL) { - processor = current_processor(); + processor = master_processor; /* * Check that the correct processor set is @@ -2314,7 +2361,8 @@ choose_processor( /* * We must verify that the chosen processor is still available. */ - if (processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE) + if (processor->state == PROCESSOR_INACTIVE || + processor->state == PROCESSOR_SHUTDOWN || processor->state == PROCESSOR_OFF_LINE) processor = PROCESSOR_NULL; } while (processor == PROCESSOR_NULL); @@ -2430,8 +2478,8 @@ thread_setrun( /* * processor_queue_shutdown: * - * Shutdown a processor run queue by moving - * non-bound threads to the current processor. + * Shutdown a processor run queue by + * re-dispatching non-bound threads. * * Associated pset must be locked, and is * returned unlocked. @@ -2480,35 +2528,25 @@ processor_queue_shutdown( pset_unlock(pset); - processor = current_processor(); - pset = processor->processor_set; - while ((thread = (thread_t)dequeue_head(&tqueue)) != THREAD_NULL) { thread_lock(thread); - thread->last_processor = PROCESSOR_NULL; - pset_lock(pset); - - processor_enqueue(processor, thread, SCHED_TAILQ); - - pset_unlock(pset); + thread_setrun(thread, SCHED_TAILQ); thread_unlock(thread); } } /* - * Check for a possible preemption point in - * the (current) thread. + * Check for a preemption point in + * the current context. * * Called at splsched. */ ast_t csw_check( - thread_t thread, processor_t processor) { - int current_pri = thread->sched_pri; ast_t result = AST_NONE; run_queue_t runq; @@ -2517,7 +2555,7 @@ csw_check( if (runq->highq >= BASEPRI_RTQUEUES) return (AST_PREEMPT | AST_URGENT); - if (runq->highq > current_pri) { + if (runq->highq > processor->current_pri) { if (runq->urgency > 0) return (AST_PREEMPT | AST_URGENT); @@ -2525,7 +2563,7 @@ csw_check( } runq = &processor->runq; - if (runq->highq > current_pri) { + if (runq->highq > processor->current_pri) { if (runq->urgency > 0) return (AST_PREEMPT | AST_URGENT); @@ -2534,7 +2572,7 @@ csw_check( } else { runq = &rt_runq; - if (runq->highq >= current_pri) { + if (runq->highq >= processor->current_pri) { if (runq->urgency > 0) return (AST_PREEMPT | AST_URGENT); @@ -2542,7 +2580,7 @@ csw_check( } runq = &processor->runq; - if (runq->highq >= current_pri) { + if (runq->highq >= processor->current_pri) { if (runq->urgency > 0) return (AST_PREEMPT | AST_URGENT); @@ -2553,10 +2591,13 @@ csw_check( if (result != AST_NONE) return (result); - if (thread->state & TH_SUSP) - result |= AST_PREEMPT; + if (machine_cpu_is_inactive(processor->cpu_num)) + return (AST_PREEMPT); - return (result); + if (processor->active_thread->state & TH_SUSP) + return (AST_PREEMPT); + + return (AST_NONE); } /* @@ -2583,11 +2624,11 @@ set_sched_pri( processor_t processor = thread->last_processor; if (thread == current_thread()) { - ast_t preempt = csw_check(thread, processor); + ast_t preempt; - if (preempt != AST_NONE) - ast_on(preempt); processor->current_pri = priority; + if ((preempt = csw_check(processor)) != AST_NONE) + ast_on(preempt); } else if ( processor != PROCESSOR_NULL && @@ -2700,44 +2741,6 @@ run_queue_remove( return (processor != PROCESSOR_NULL); } -/* - * choose_thread: - * - * Choose a thread to execute from the run queues - * and return it. - * - * Called with pset scheduling lock and rt lock held, - * released on return. 
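/*
 * A minimal sketch of the comparison rule inside the reworked csw_check()
 * above, which now looks only at the processor: while the running thread
 * still has quantum left (first timeslice), a queued thread must be strictly
 * higher priority than processor->current_pri to request a switch, and
 * afterwards an equal priority suffices.  should_preempt() is an illustrative
 * reduction; the real routine additionally reports urgency and handles the
 * inactive-cpu and suspended-thread cases.
 */
#include <stdbool.h>

static bool
should_preempt(int queued_highq, int current_pri, bool first_timeslice)
{
    return first_timeslice ? (queued_highq > current_pri)
                           : (queued_highq >= current_pri);
}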
- */ -static thread_t -choose_thread( - processor_t processor) -{ - processor_set_t pset = processor->processor_set; - thread_t thread; - - if (processor->runq.count > 0 && processor->runq.highq >= rt_runq.highq) { - simple_unlock(&rt_lock); - - thread = run_queue_dequeue(&processor->runq, SCHED_HEADQ); - - pset_pri_hint(pset, processor, thread->sched_pri); - - processor->deadline = UINT64_MAX; - pset_unlock(pset); - - return (thread); - } - - thread = run_queue_dequeue(&rt_runq, SCHED_HEADQ); - simple_unlock(&rt_lock); - - processor->deadline = thread->realtime.deadline; - pset_unlock(pset); - - return (thread); -} - /* * steal_processor_thread: * @@ -2813,7 +2816,6 @@ steal_thread( remqueue(&cset->active_queue, (queue_entry_t)processor); enqueue_tail(&cset->active_queue, (queue_entry_t)processor); - processor->deadline = UINT64_MAX; pset_unlock(cset); return (thread); @@ -2872,6 +2874,9 @@ processor_idle( machine_idle(); (void)splsched(); + + if (processor->state == PROCESSOR_INACTIVE && !machine_cpu_is_inactive(processor->cpu_num)) + break; } timer_switch(&PROCESSOR_DATA(processor, idle_state), @@ -2919,12 +2924,16 @@ processor_idle( else if (state == PROCESSOR_IDLE) { remqueue(&pset->idle_queue, (queue_entry_t)processor); - pset->idle_count--; processor->state = PROCESSOR_RUNNING; enqueue_tail(&pset->active_queue, (queue_entry_t)processor); } else + if (state == PROCESSOR_INACTIVE) { + processor->state = PROCESSOR_RUNNING; + enqueue_tail(&pset->active_queue, (queue_entry_t)processor); + } + else if (state == PROCESSOR_SHUTDOWN) { /* * Going off-line. Force a diff --git a/osfmk/kern/stack.c b/osfmk/kern/stack.c index 68cdcabc0..fe792f997 100644 --- a/osfmk/kern/stack.c +++ b/osfmk/kern/stack.c @@ -61,7 +61,8 @@ decl_simple_lock_data(static,stack_lock_data) static vm_offset_t stack_free_list; static unsigned int stack_free_count, stack_free_hiwat; /* free list count */ -static unsigned int stack_total, stack_hiwat; /* current total count */ +static unsigned int stack_hiwat; +unsigned int stack_total; /* current total count */ static unsigned int stack_free_target; static int stack_free_delta; diff --git a/osfmk/kern/startup.c b/osfmk/kern/startup.c index d0a496a07..a4f1eebd1 100644 --- a/osfmk/kern/startup.c +++ b/osfmk/kern/startup.c @@ -407,7 +407,7 @@ load_context( load_context_kprintf("calling processor_up\n"); processor_up(processor); - PMAP_ACTIVATE_KERNEL(PROCESSOR_DATA(processor, slot_num)); + PMAP_ACTIVATE_KERNEL(processor->cpu_num); /* * Acquire a stack if none attached. The panic @@ -441,7 +441,7 @@ load_context( timer_start(&PROCESSOR_DATA(processor, system_state), processor->last_dispatch); PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state); - PMAP_ACTIVATE_USER(thread, PROCESSOR_DATA(processor, slot_num)); + PMAP_ACTIVATE_USER(thread, processor->cpu_num); load_context_kprintf("calling machine_load_context\n"); machine_load_context(thread); diff --git a/osfmk/kern/syscall_subr.c b/osfmk/kern/syscall_subr.c index 311e96c7d..15af1fa7e 100644 --- a/osfmk/kern/syscall_subr.c +++ b/osfmk/kern/syscall_subr.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -436,7 +436,7 @@ thread_poll_yield( self->depress_timer_active++; thread_unlock(self); - if ((preempt = csw_check(self, myprocessor)) != AST_NONE) + if ((preempt = csw_check(myprocessor)) != AST_NONE) ast_on(preempt); } } diff --git a/osfmk/kern/thread_call.c b/osfmk/kern/thread_call.c index d10c7b4bb..7ae31523c 100644 --- a/osfmk/kern/thread_call.c +++ b/osfmk/kern/thread_call.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1993-1995, 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include #include @@ -46,149 +46,125 @@ #include -#define internal_call_num 768 +decl_simple_lock_data(static,thread_call_lock) -#define thread_call_thread_min 4 +static zone_t thread_call_zone; -static -thread_call_data_t - internal_call_storage[internal_call_num]; +struct thread_call_group { + queue_head_t pending_queue; + uint32_t pending_count; -decl_simple_lock_data(static,thread_call_lock) + queue_head_t delayed_queue; -static -timer_call_data_t - thread_call_delaytimer; + timer_call_data_t delayed_timer; -static -queue_head_t - thread_call_xxx_queue, - thread_call_pending_queue, thread_call_delayed_queue; + struct wait_queue idle_wqueue; + uint32_t idle_count, active_count; +}; -static -struct wait_queue - call_thread_waitqueue; +typedef struct thread_call_group *thread_call_group_t; -static -boolean_t - activate_thread_awake; - -static struct { - int pending_num, - pending_hiwat; - int active_num, - active_hiwat, - active_lowat; - int delayed_num, - delayed_hiwat; - int idle_thread_num; - int thread_num, - thread_hiwat, - thread_lowat; -} thread_call_vars; +static struct thread_call_group thread_call_group0; -static __inline__ thread_call_t - _internal_call_allocate(void); +static boolean_t thread_call_daemon_awake; -static __inline__ void -_internal_call_release( - thread_call_t call -); +#define thread_call_thread_min 4 -static __inline__ void -_pending_call_enqueue( - thread_call_t call -), -_pending_call_dequeue( - thread_call_t call -), -_delayed_call_enqueue( - thread_call_t call -), -_delayed_call_dequeue( - thread_call_t call -); +#define internal_call_count 768 -static __inline__ void -_set_delayed_call_timer( - thread_call_t call -); - -static boolean_t -_remove_from_pending_queue( - thread_call_func_t func, - thread_call_param_t param0, - boolean_t remove_all -), -_remove_from_delayed_queue( - thread_call_func_t func, - thread_call_param_t param0, - boolean_t remove_all -); +static thread_call_data_t internal_call_storage[internal_call_count]; +static queue_head_t thread_call_internal_queue; -static inline void - _call_thread_wake(void); +static __inline__ thread_call_t _internal_call_allocate(void); -static void - _call_thread(void), - _activate_thread(void); +static __inline__ void _internal_call_release( + thread_call_t call); -static void -_delayed_call_timer( - timer_call_param_t p0, - timer_call_param_t p1 -); +static __inline__ boolean_t _pending_call_enqueue( + thread_call_t call, + thread_call_group_t group), + _delayed_call_enqueue( + thread_call_t call, + thread_call_group_t group, + uint64_t deadline), + _call_dequeue( + thread_call_t call, + thread_call_group_t group); + +static __inline__ void thread_call_wake( + thread_call_group_t group); + +static __inline__ void _set_delayed_call_timer( + thread_call_t call, + thread_call_group_t group); + +static boolean_t 
_remove_from_pending_queue( + thread_call_func_t func, + thread_call_param_t param0, + boolean_t remove_all), + _remove_from_delayed_queue( + thread_call_func_t func, + thread_call_param_t param0, + boolean_t remove_all); + +static void thread_call_daemon( + thread_call_group_t group), + thread_call_thread( + thread_call_group_t group); + +static void thread_call_delayed_timer( + timer_call_param_t p0, + timer_call_param_t p1); #define qe(x) ((queue_entry_t)(x)) #define TC(x) ((thread_call_t)(x)) /* - * Routine: thread_call_initialize [public] - * - * Description: Initialize this module, called - * early during system initialization. + * thread_call_initialize: * - * Preconditions: None. - * - * Postconditions: None. + * Initialize this module, called + * early during system initialization. */ - void thread_call_initialize(void) { - kern_return_t result; - thread_t thread; - thread_call_t call; - spl_t s; + thread_call_t call; + thread_call_group_t group = &thread_call_group0; + kern_return_t result; + thread_t thread; + int i; + spl_t s; + + i = sizeof (thread_call_data_t); + thread_call_zone = zinit(i, 4096 * i, 16 * i, "thread_call"); simple_lock_init(&thread_call_lock, 0); s = splsched(); simple_lock(&thread_call_lock); - queue_init(&thread_call_pending_queue); - queue_init(&thread_call_delayed_queue); + queue_init(&group->pending_queue); + queue_init(&group->delayed_queue); + + timer_call_setup(&group->delayed_timer, thread_call_delayed_timer, group); - queue_init(&thread_call_xxx_queue); + wait_queue_init(&group->idle_wqueue, SYNC_POLICY_FIFO); + + queue_init(&thread_call_internal_queue); for ( call = internal_call_storage; - call < &internal_call_storage[internal_call_num]; + call < &internal_call_storage[internal_call_count]; call++) { - enqueue_tail(&thread_call_xxx_queue, qe(call)); + enqueue_tail(&thread_call_internal_queue, qe(call)); } - timer_call_setup(&thread_call_delaytimer, _delayed_call_timer, NULL); - - wait_queue_init(&call_thread_waitqueue, SYNC_POLICY_FIFO); - thread_call_vars.thread_lowat = thread_call_thread_min; - - activate_thread_awake = TRUE; + thread_call_daemon_awake = TRUE; simple_unlock(&thread_call_lock); splx(s); - result = kernel_thread_start_priority((thread_continue_t)_activate_thread, NULL, MAXPRI_KERNEL - 2, &thread); + result = kernel_thread_start_priority((thread_continue_t)thread_call_daemon, group, BASEPRI_PREEMPT + 1, &thread); if (result != KERN_SUCCESS) panic("thread_call_initialize"); @@ -199,218 +175,170 @@ void thread_call_setup( thread_call_t call, thread_call_func_t func, - thread_call_param_t param0 -) + thread_call_param_t param0) { call_entry_setup(call, func, param0); } /* - * Routine: _internal_call_allocate [private, inline] - * - * Purpose: Allocate an internal callout entry. + * _internal_call_allocate: * - * Preconditions: thread_call_lock held. + * Allocate an internal callout entry. * - * Postconditions: None. + * Called with thread_call_lock held. */ - static __inline__ thread_call_t _internal_call_allocate(void) { thread_call_t call; - if (queue_empty(&thread_call_xxx_queue)) + if (queue_empty(&thread_call_internal_queue)) panic("_internal_call_allocate"); - call = TC(dequeue_head(&thread_call_xxx_queue)); + call = TC(dequeue_head(&thread_call_internal_queue)); return (call); } /* - * Routine: _internal_call_release [private, inline] + * _internal_call_release: * - * Purpose: Release an internal callout entry which - * is no longer pending (or delayed). 
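/*
 * A minimal sketch of the internal-call pool that thread_call_initialize()
 * above builds: a fixed array of entries threaded onto a free queue at boot,
 * with allocate/release mirroring _internal_call_allocate() and
 * _internal_call_release().  The names here (icall, icall_pool_*) are
 * illustrative, and the sketch omits the lock the kernel holds around them.
 */
#include <stddef.h>

#define ICALL_COUNT 768                 /* same as internal_call_count above */

struct icall {
    struct icall *next;
    /* callout function and parameters would follow in the real entry */
};

static struct icall  icall_storage[ICALL_COUNT];
static struct icall *icall_free_list;

static void
icall_pool_init(void)
{
    int i;

    for (i = 0; i < ICALL_COUNT; i++) {
        icall_storage[i].next = icall_free_list;
        icall_free_list = &icall_storage[i];
    }
}

static struct icall *
icall_alloc(void)
{
    struct icall *c = icall_free_list;

    if (c != NULL)                      /* the kernel panics on exhaustion */
        icall_free_list = c->next;
    return c;
}

static void
icall_release(struct icall *c)
{
    c->next = icall_free_list;
    icall_free_list = c;
}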
+ * Release an internal callout entry which + * is no longer pending (or delayed). * - * Preconditions: thread_call_lock held. - * - * Postconditions: None. + * Called with thread_call_lock held. */ - -static __inline__ -void +static __inline__ void _internal_call_release( - thread_call_t call -) + thread_call_t call) { if ( call >= internal_call_storage && - call < &internal_call_storage[internal_call_num] ) - enqueue_head(&thread_call_xxx_queue, qe(call)); + call < &internal_call_storage[internal_call_count] ) + enqueue_head(&thread_call_internal_queue, qe(call)); } /* - * Routine: _pending_call_enqueue [private, inline] + * _pending_call_enqueue: * - * Purpose: Place an entry at the end of the - * pending queue, to be executed soon. + * Place an entry at the end of the + * pending queue, to be executed soon. * - * Preconditions: thread_call_lock held. + * Returns TRUE if the entry was already + * on a queue. * - * Postconditions: None. + * Called with thread_call_lock held. */ - -static __inline__ -void +static __inline__ boolean_t _pending_call_enqueue( - thread_call_t call -) + thread_call_t call, + thread_call_group_t group) { - enqueue_tail(&thread_call_pending_queue, qe(call)); - if (++thread_call_vars.pending_num > thread_call_vars.pending_hiwat) - thread_call_vars.pending_hiwat = thread_call_vars.pending_num; + queue_t old_queue; - call->state = PENDING; -} + old_queue = call_entry_enqueue_tail(call, &group->pending_queue); -/* - * Routine: _pending_call_dequeue [private, inline] - * - * Purpose: Remove an entry from the pending queue, - * effectively unscheduling it. - * - * Preconditions: thread_call_lock held. - * - * Postconditions: None. - */ + group->pending_count++; -static __inline__ -void -_pending_call_dequeue( - thread_call_t call -) -{ - (void)remque(qe(call)); - thread_call_vars.pending_num--; - - call->state = IDLE; + return (old_queue != NULL); } /* - * Routine: _delayed_call_enqueue [private, inline] + * _delayed_call_enqueue: * - * Purpose: Place an entry on the delayed queue, - * after existing entries with an earlier - * (or identical) deadline. + * Place an entry on the delayed queue, + * after existing entries with an earlier + * (or identical) deadline. * - * Preconditions: thread_call_lock held. + * Returns TRUE if the entry was already + * on a queue. * - * Postconditions: None. + * Called with thread_call_lock held. */ - -static __inline__ -void +static __inline__ boolean_t _delayed_call_enqueue( - thread_call_t call -) + thread_call_t call, + thread_call_group_t group, + uint64_t deadline) { - thread_call_t current; - - current = TC(queue_first(&thread_call_delayed_queue)); - - while (TRUE) { - if ( queue_end(&thread_call_delayed_queue, qe(current)) || - call->deadline < current->deadline ) { - current = TC(queue_prev(qe(current))); - break; - } - - current = TC(queue_next(qe(current))); - } + queue_t old_queue; - insque(qe(call), qe(current)); - if (++thread_call_vars.delayed_num > thread_call_vars.delayed_hiwat) - thread_call_vars.delayed_hiwat = thread_call_vars.delayed_num; - - call->state = DELAYED; + old_queue = call_entry_enqueue_deadline(call, &group->delayed_queue, deadline); + + if (old_queue == &group->pending_queue) + group->pending_count--; + + return (old_queue != NULL); } /* - * Routine: _delayed_call_dequeue [private, inline] + * _call_dequeue: * - * Purpose: Remove an entry from the delayed queue, - * effectively unscheduling it. + * Remove an entry from a queue. * - * Preconditions: thread_call_lock held. 
+ * Returns TRUE if the entry was on a queue. * - * Postconditions: None. + * Called with thread_call_lock held. */ - -static __inline__ -void -_delayed_call_dequeue( - thread_call_t call -) +static __inline__ boolean_t +_call_dequeue( + thread_call_t call, + thread_call_group_t group) { - (void)remque(qe(call)); - thread_call_vars.delayed_num--; - - call->state = IDLE; + queue_t old_queue; + + old_queue = call_entry_dequeue(call); + + if (old_queue == &group->pending_queue) + group->pending_count--; + + return (old_queue != NULL); } /* - * Routine: _set_delayed_call_timer [private] + * _set_delayed_call_timer: * - * Purpose: Reset the timer so that it - * next expires when the entry is due. + * Reset the timer so that it + * next expires when the entry is due. * - * Preconditions: thread_call_lock held. - * - * Postconditions: None. + * Called with thread_call_lock held. */ - static __inline__ void _set_delayed_call_timer( - thread_call_t call -) + thread_call_t call, + thread_call_group_t group) { - timer_call_enter(&thread_call_delaytimer, call->deadline); + timer_call_enter(&group->delayed_timer, call->deadline); } /* - * Routine: _remove_from_pending_queue [private] + * _remove_from_pending_queue: * - * Purpose: Remove the first (or all) matching - * entries from the pending queue, - * effectively unscheduling them. - * Returns whether any matching entries - * were found. + * Remove the first (or all) matching + * entries from the pending queue. * - * Preconditions: thread_call_lock held. + * Returns TRUE if any matching entries + * were found. * - * Postconditions: None. + * Called with thread_call_lock held. */ - -static -boolean_t +static boolean_t _remove_from_pending_queue( thread_call_func_t func, thread_call_param_t param0, - boolean_t remove_all -) + boolean_t remove_all) { - boolean_t call_removed = FALSE; - thread_call_t call; + boolean_t call_removed = FALSE; + thread_call_t call; + thread_call_group_t group = &thread_call_group0; - call = TC(queue_first(&thread_call_pending_queue)); + call = TC(queue_first(&group->pending_queue)); - while (!queue_end(&thread_call_pending_queue, qe(call))) { + while (!queue_end(&group->pending_queue, qe(call))) { if ( call->func == func && call->param0 == param0 ) { thread_call_t next = TC(queue_next(qe(call))); - _pending_call_dequeue(call); + _call_dequeue(call, group); _internal_call_release(call); @@ -428,38 +356,34 @@ _remove_from_pending_queue( } /* - * Routine: _remove_from_delayed_queue [private] + * _remove_from_delayed_queue: * - * Purpose: Remove the first (or all) matching - * entries from the delayed queue, - * effectively unscheduling them. - * Returns whether any matching entries - * were found. + * Remove the first (or all) matching + * entries from the delayed queue. * - * Preconditions: thread_call_lock held. + * Returns TRUE if any matching entries + * were found. * - * Postconditions: None. + * Called with thread_call_lock held. 
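/*
 * A minimal sketch of the state model the reworked helpers above switch to:
 * instead of an explicit IDLE/PENDING/DELAYED state field, each entry records
 * the queue it is currently linked on (NULL when idle), and enqueue/dequeue
 * report whether it was already queued, as _pending_call_enqueue() and
 * _call_dequeue() now do.  struct centry / struct cqueue are simplified,
 * singly linked stand-ins for call_entry_t and its queues.
 */
#include <stdbool.h>
#include <stddef.h>

struct cqueue;

struct centry {
    struct centry *next;
    struct cqueue *queue;       /* NULL while the entry is idle */
};

struct cqueue {
    struct centry *head;
};

/* Unlink e from whatever queue holds it; return true if it was queued. */
static bool
centry_dequeue(struct centry *e)
{
    struct centry **pp;

    if (e->queue == NULL)
        return false;

    for (pp = &e->queue->head; *pp != NULL; pp = &(*pp)->next) {
        if (*pp == e) {
            *pp = e->next;
            break;
        }
    }
    e->next = NULL;
    e->queue = NULL;
    return true;
}

/* Append e to q; return true if it was already on some queue beforehand. */
static bool
centry_enqueue_tail(struct centry *e, struct cqueue *q)
{
    bool            was_queued = centry_dequeue(e);
    struct centry **pp = &q->head;

    while (*pp != NULL)
        pp = &(*pp)->next;
    *pp = e;
    e->queue = q;
    return was_queued;
}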
*/ - -static -boolean_t +static boolean_t _remove_from_delayed_queue( thread_call_func_t func, thread_call_param_t param0, - boolean_t remove_all -) + boolean_t remove_all) { - boolean_t call_removed = FALSE; - thread_call_t call; + boolean_t call_removed = FALSE; + thread_call_t call; + thread_call_group_t group = &thread_call_group0; - call = TC(queue_first(&thread_call_delayed_queue)); + call = TC(queue_first(&group->delayed_queue)); - while (!queue_end(&thread_call_delayed_queue, qe(call))) { + while (!queue_end(&group->delayed_queue, qe(call))) { if ( call->func == func && call->param0 == param0 ) { thread_call_t next = TC(queue_next(qe(call))); - _delayed_call_dequeue(call); + _call_dequeue(call, group); _internal_call_release(call); @@ -477,34 +401,29 @@ _remove_from_delayed_queue( } /* - * Routine: thread_call_func [public] - * - * Purpose: Schedule a function callout. - * Guarantees { function, argument } - * uniqueness if unique_call is TRUE. + * thread_call_func: * - * Preconditions: Callable from an interrupt context - * below splsched. + * Enqueue a function callout. * - * Postconditions: None. + * Guarantees { function, argument } + * uniqueness if unique_call is TRUE. */ - void thread_call_func( thread_call_func_t func, thread_call_param_t param, - boolean_t unique_call -) + boolean_t unique_call) { - thread_call_t call; - spl_t s; + thread_call_t call; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); - call = TC(queue_first(&thread_call_pending_queue)); + call = TC(queue_first(&group->pending_queue)); - while (unique_call && !queue_end(&thread_call_pending_queue, qe(call))) { + while (unique_call && !queue_end(&group->pending_queue, qe(call))) { if ( call->func == func && call->param0 == param ) { break; @@ -513,16 +432,16 @@ thread_call_func( call = TC(queue_next(qe(call))); } - if (!unique_call || queue_end(&thread_call_pending_queue, qe(call))) { + if (!unique_call || queue_end(&group->pending_queue, qe(call))) { call = _internal_call_allocate(); call->func = func; call->param0 = param; call->param1 = NULL; - _pending_call_enqueue(call); + _pending_call_enqueue(call, group); - if (thread_call_vars.active_num <= 0) - _call_thread_wake(); + if (group->active_count == 0) + thread_call_wake(group); } simple_unlock(&thread_call_lock); @@ -530,26 +449,20 @@ thread_call_func( } /* - * Routine: thread_call_func_delayed [public] - * - * Purpose: Schedule a function callout to - * occur at the stated time. - * - * Preconditions: Callable from an interrupt context - * below splsched. + * thread_call_func_delayed: * - * Postconditions: None. + * Enqueue a function callout to + * occur at the stated time. 
*/ - void thread_call_func_delayed( thread_call_func_t func, thread_call_param_t param, - uint64_t deadline -) + uint64_t deadline) { - thread_call_t call; - spl_t s; + thread_call_t call; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); @@ -558,41 +471,33 @@ thread_call_func_delayed( call->func = func; call->param0 = param; call->param1 = 0; - call->deadline = deadline; - _delayed_call_enqueue(call); + _delayed_call_enqueue(call, group, deadline); - if (queue_first(&thread_call_delayed_queue) == qe(call)) - _set_delayed_call_timer(call); + if (queue_first(&group->delayed_queue) == qe(call)) + _set_delayed_call_timer(call, group); simple_unlock(&thread_call_lock); splx(s); } /* - * Routine: thread_call_func_cancel [public] + * thread_call_func_cancel: * - * Purpose: Unschedule a function callout. - * Removes one (or all) - * { function, argument } - * instance(s) from either (or both) - * the pending and the delayed queue, - * in that order. Returns a boolean - * indicating whether any calls were - * cancelled. + * Dequeue a function callout. * - * Preconditions: Callable from an interrupt context - * below splsched. + * Removes one (or all) { function, argument } + * instance(s) from either (or both) + * the pending and the delayed queue, + * in that order. * - * Postconditions: None. + * Returns TRUE if any calls were cancelled. */ - boolean_t thread_call_func_cancel( thread_call_func_t func, thread_call_param_t param, - boolean_t cancel_all -) + boolean_t cancel_all) { boolean_t result; spl_t s; @@ -614,53 +519,37 @@ thread_call_func_cancel( } /* - * Routine: thread_call_allocate [public] + * thread_call_allocate: * - * Purpose: Allocate an external callout - * entry. - * - * Preconditions: None. - * - * Postconditions: None. + * Allocate a callout entry. */ - thread_call_t thread_call_allocate( thread_call_func_t func, - thread_call_param_t param0 -) + thread_call_param_t param0) { - thread_call_t call = (void *)kalloc(sizeof (thread_call_data_t)); - - call->func = func; - call->param0 = param0; - call->state = IDLE; - + thread_call_t call = zalloc(thread_call_zone); + + call_entry_setup(call, func, param0); + return (call); } /* - * Routine: thread_call_free [public] - * - * Purpose: Free an external callout - * entry. - * - * Preconditions: None. + * thread_call_free: * - * Postconditions: None. + * Free a callout entry. */ - boolean_t thread_call_free( - thread_call_t call -) + thread_call_t call) { spl_t s; s = splsched(); simple_lock(&thread_call_lock); - if (call->state != IDLE) { + if (call->queue != NULL) { simple_unlock(&thread_call_lock); splx(s); @@ -670,46 +559,35 @@ thread_call_free( simple_unlock(&thread_call_lock); splx(s); - kfree(call, sizeof (thread_call_data_t)); + zfree(thread_call_zone, call); return (TRUE); } /* - * Routine: thread_call_enter [public] + * thread_call_enter: * - * Purpose: Schedule an external callout - * entry to occur "soon". Returns a - * boolean indicating whether the call - * had been already scheduled. + * Enqueue a callout entry to occur "soon". * - * Preconditions: Callable from an interrupt context - * below splsched. - * - * Postconditions: None. + * Returns TRUE if the call was + * already on a queue. 
*/ - boolean_t thread_call_enter( - thread_call_t call -) + thread_call_t call) { - boolean_t result = TRUE; - spl_t s; + boolean_t result = TRUE; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); - if (call->state != PENDING) { - if (call->state == DELAYED) - _delayed_call_dequeue(call); - else if (call->state == IDLE) - result = FALSE; - - _pending_call_enqueue(call); + if (call->queue != &group->pending_queue) { + result = _pending_call_enqueue(call, group); - if (thread_call_vars.active_num <= 0) - _call_thread_wake(); + if (group->active_count == 0) + thread_call_wake(group); } call->param1 = 0; @@ -723,26 +601,21 @@ thread_call_enter( boolean_t thread_call_enter1( thread_call_t call, - thread_call_param_t param1 -) + thread_call_param_t param1) { - boolean_t result = TRUE; - spl_t s; + boolean_t result = TRUE; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); - if (call->state != PENDING) { - if (call->state == DELAYED) - _delayed_call_dequeue(call); - else if (call->state == IDLE) - result = FALSE; - - _pending_call_enqueue(call); - - if (thread_call_vars.active_num <= 0) - _call_thread_wake(); - } + if (call->queue != &group->pending_queue) { + result = _pending_call_enqueue(call, group); + + if (group->active_count == 0) + thread_call_wake(group); + } call->param1 = param1; @@ -753,45 +626,32 @@ thread_call_enter1( } /* - * Routine: thread_call_enter_delayed [public] - * - * Purpose: Schedule an external callout - * entry to occur at the stated time. - * Returns a boolean indicating whether - * the call had been already scheduled. + * thread_call_enter_delayed: * - * Preconditions: Callable from an interrupt context - * below splsched. + * Enqueue a callout entry to occur + * at the stated time. * - * Postconditions: None. + * Returns TRUE if the call was + * already on a queue. 
*/ - boolean_t thread_call_enter_delayed( thread_call_t call, - uint64_t deadline -) + uint64_t deadline) { - boolean_t result = TRUE; - spl_t s; + boolean_t result = TRUE; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); - if (call->state == PENDING) - _pending_call_dequeue(call); - else if (call->state == DELAYED) - _delayed_call_dequeue(call); - else if (call->state == IDLE) - result = FALSE; - - call->param1 = 0; - call->deadline = deadline; + result = _delayed_call_enqueue(call, group, deadline); - _delayed_call_enqueue(call); + if (queue_first(&group->delayed_queue) == qe(call)) + _set_delayed_call_timer(call, group); - if (queue_first(&thread_call_delayed_queue) == qe(call)) - _set_delayed_call_timer(call); + call->param1 = 0; simple_unlock(&thread_call_lock); splx(s); @@ -803,29 +663,21 @@ boolean_t thread_call_enter1_delayed( thread_call_t call, thread_call_param_t param1, - uint64_t deadline -) + uint64_t deadline) { - boolean_t result = TRUE; - spl_t s; + boolean_t result = TRUE; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); - if (call->state == PENDING) - _pending_call_dequeue(call); - else if (call->state == DELAYED) - _delayed_call_dequeue(call); - else if (call->state == IDLE) - result = FALSE; + result = _delayed_call_enqueue(call, group, deadline); - call->param1 = param1; - call->deadline = deadline; + if (queue_first(&group->delayed_queue) == qe(call)) + _set_delayed_call_timer(call, group); - _delayed_call_enqueue(call); - - if (queue_first(&thread_call_delayed_queue) == qe(call)) - _set_delayed_call_timer(call); + call->param1 = param1; simple_unlock(&thread_call_lock); splx(s); @@ -834,36 +686,25 @@ thread_call_enter1_delayed( } /* - * Routine: thread_call_cancel [public] - * - * Purpose: Unschedule a callout entry. - * Returns a boolean indicating - * whether the call had actually - * been scheduled. + * thread_call_cancel: * - * Preconditions: Callable from an interrupt context - * below splsched. + * Dequeue a callout entry. * - * Postconditions: None. + * Returns TRUE if the call was + * on a queue. */ - boolean_t thread_call_cancel( - thread_call_t call -) + thread_call_t call) { - boolean_t result = TRUE; - spl_t s; + boolean_t result; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); - - if (call->state == PENDING) - _pending_call_dequeue(call); - else if (call->state == DELAYED) - _delayed_call_dequeue(call); - else - result = FALSE; + + result = _call_dequeue(call, group); simple_unlock(&thread_call_lock); splx(s); @@ -872,31 +713,26 @@ thread_call_cancel( } /* - * Routine: thread_call_is_delayed [public] - * - * Purpose: Returns a boolean indicating - * whether a call is currently scheduled - * to occur at a later time. Optionally - * returns the expiration time. + * thread_call_is_delayed: * - * Preconditions: Callable from an interrupt context - * below splsched. + * Returns TRUE if the call is + * currently on a delayed queue. * - * Postconditions: None. + * Optionally returns the expiration time. 
*/ - boolean_t thread_call_is_delayed( thread_call_t call, uint64_t *deadline) { - boolean_t result = FALSE; - spl_t s; + boolean_t result = FALSE; + thread_call_group_t group = &thread_call_group0; + spl_t s; s = splsched(); simple_lock(&thread_call_lock); - if (call->state == DELAYED) { + if (call->queue == &group->delayed_queue) { if (deadline != NULL) *deadline = call->deadline; result = TRUE; @@ -909,31 +745,26 @@ thread_call_is_delayed( } /* - * Routine: _call_thread_wake [private, inline] - * - * Purpose: Wake a callout thread to service - * pending callout entries. May wake - * the activate thread in order to - * create additional callout threads. + * thread_call_wake: * - * Preconditions: thread_call_lock held. + * Wake a call thread to service + * pending call entries. May wake + * the daemon thread in order to + * create additional call threads. * - * Postconditions: None. + * Called with thread_call_lock held. */ - -static inline void -_call_thread_wake(void) +static __inline__ void +thread_call_wake( + thread_call_group_t group) { - if (wait_queue_wakeup_one(&call_thread_waitqueue, NULL, THREAD_AWAKENED) == KERN_SUCCESS) { - thread_call_vars.idle_thread_num--; - - if (++thread_call_vars.active_num > thread_call_vars.active_hiwat) - thread_call_vars.active_hiwat = thread_call_vars.active_num; + if (group->idle_count > 0 && wait_queue_wakeup_one(&group->idle_wqueue, NULL, THREAD_AWAKENED) == KERN_SUCCESS) { + group->idle_count--; group->active_count++; } else - if (!activate_thread_awake) { - thread_wakeup_one(&activate_thread_awake); - activate_thread_awake = TRUE; + if (!thread_call_daemon_awake) { + thread_call_daemon_awake = TRUE; + thread_wakeup_one(&thread_call_daemon_awake); } } @@ -942,28 +773,24 @@ _call_thread_wake(void) * * Call out invoked by the scheduler. */ - static void sched_call_thread( - int type, -__unused thread_t thread) + int type, +__unused thread_t thread) { + thread_call_group_t group = &thread_call_group0; + simple_lock(&thread_call_lock); switch (type) { case SCHED_CALL_BLOCK: - if (--thread_call_vars.active_num < thread_call_vars.active_lowat) - thread_call_vars.active_lowat = thread_call_vars.active_num; - - if ( thread_call_vars.active_num <= 0 && - thread_call_vars.pending_num > 0 ) - _call_thread_wake(); + if (--group->active_count == 0 && group->pending_count > 0) + thread_call_wake(group); break; case SCHED_CALL_UNBLOCK: - if (++thread_call_vars.active_num > thread_call_vars.active_hiwat) - thread_call_vars.active_hiwat = thread_call_vars.active_num; + group->active_count++; break; } @@ -971,18 +798,11 @@ __unused thread_t thread) } /* - * Routine: _call_thread [private] - * - * Purpose: Executed by a callout thread. - * - * Preconditions: None. - * - * Postconditions: None. 
+ * thread_call_thread: */ - -static -void -_call_thread_continue(void) +static void +thread_call_thread( + thread_call_group_t group) { thread_t self = current_thread(); @@ -991,19 +811,19 @@ _call_thread_continue(void) thread_sched_call(self, sched_call_thread); - while (thread_call_vars.pending_num > 0) { + while (group->pending_count > 0) { thread_call_t call; thread_call_func_t func; thread_call_param_t param0, param1; - call = TC(dequeue_head(&thread_call_pending_queue)); - thread_call_vars.pending_num--; + call = TC(dequeue_head(&group->pending_queue)); + group->pending_count--; func = call->func; param0 = call->param0; param1 = call->param1; - call->state = IDLE; + call->queue = NULL; _internal_call_release(call); @@ -1016,31 +836,27 @@ _call_thread_continue(void) (*func)(param0, param1); - (void)thread_funnel_set(self->funnel_lock, FALSE); + (void)thread_funnel_set(self->funnel_lock, FALSE); /* XXX */ (void) splsched(); simple_lock(&thread_call_lock); } thread_sched_call(self, NULL); + group->active_count--; - if (--thread_call_vars.active_num < thread_call_vars.active_lowat) - thread_call_vars.active_lowat = thread_call_vars.active_num; - - if (thread_call_vars.idle_thread_num < thread_call_vars.thread_lowat) { - thread_call_vars.idle_thread_num++; + if (group->idle_count < thread_call_thread_min) { + group->idle_count++; - wait_queue_assert_wait(&call_thread_waitqueue, NULL, THREAD_UNINT, 0); + wait_queue_assert_wait(&group->idle_wqueue, NULL, THREAD_UNINT, 0); simple_unlock(&thread_call_lock); (void) spllo(); - thread_block((thread_continue_t)_call_thread_continue); + thread_block_parameter((thread_continue_t)thread_call_thread, group); /* NOTREACHED */ } - - thread_call_vars.thread_num--; - + simple_unlock(&thread_call_lock); (void) spllo(); @@ -1048,27 +864,12 @@ _call_thread_continue(void) /* NOTREACHED */ } -static -void -_call_thread(void) -{ - _call_thread_continue(); - /* NOTREACHED */ -} - /* - * Routine: _activate_thread [private] - * - * Purpose: Executed by the activate thread. - * - * Preconditions: None. - * - * Postconditions: Never terminates. 
+ * thread_call_daemon: */ - -static -void -_activate_thread_continue(void) +static void +thread_call_daemon_continue( + thread_call_group_t group) { kern_return_t result; thread_t thread; @@ -1076,89 +877,78 @@ _activate_thread_continue(void) (void) splsched(); simple_lock(&thread_call_lock); - while ( thread_call_vars.active_num <= 0 && - thread_call_vars.pending_num > 0 ) { - - if (++thread_call_vars.active_num > thread_call_vars.active_hiwat) - thread_call_vars.active_hiwat = thread_call_vars.active_num; - - if (++thread_call_vars.thread_num > thread_call_vars.thread_hiwat) - thread_call_vars.thread_hiwat = thread_call_vars.thread_num; + while (group->active_count == 0 && group->pending_count > 0) { + group->active_count++; simple_unlock(&thread_call_lock); (void) spllo(); - result = kernel_thread_start_priority((thread_continue_t)_call_thread, NULL, MAXPRI_KERNEL - 1, &thread); + result = kernel_thread_start_priority((thread_continue_t)thread_call_thread, group, BASEPRI_PREEMPT, &thread); if (result != KERN_SUCCESS) - panic("activate_thread"); + panic("thread_call_daemon"); thread_deallocate(thread); (void) splsched(); simple_lock(&thread_call_lock); } - - assert_wait(&activate_thread_awake, THREAD_INTERRUPTIBLE); - activate_thread_awake = FALSE; + + thread_call_daemon_awake = FALSE; + assert_wait(&thread_call_daemon_awake, THREAD_UNINT); simple_unlock(&thread_call_lock); (void) spllo(); - thread_block((thread_continue_t)_activate_thread_continue); + thread_block_parameter((thread_continue_t)thread_call_daemon_continue, group); /* NOTREACHED */ } -static -void -_activate_thread(void) +static void +thread_call_daemon( + thread_call_group_t group) { thread_t self = current_thread(); self->options |= TH_OPT_VMPRIV; vm_page_free_reserve(2); /* XXX */ - _activate_thread_continue(); + thread_call_daemon_continue(group); /* NOTREACHED */ } -static -void -_delayed_call_timer( - __unused timer_call_param_t p0, +static void +thread_call_delayed_timer( + timer_call_param_t p0, __unused timer_call_param_t p1 ) { - uint64_t timestamp; - thread_call_t call; - boolean_t new_pending = FALSE; - spl_t s; + thread_call_t call; + thread_call_group_t group = p0; + boolean_t new_pending = FALSE; + uint64_t timestamp; - s = splsched(); simple_lock(&thread_call_lock); - clock_get_uptime(×tamp); + timestamp = mach_absolute_time(); - call = TC(queue_first(&thread_call_delayed_queue)); + call = TC(queue_first(&group->delayed_queue)); - while (!queue_end(&thread_call_delayed_queue, qe(call))) { + while (!queue_end(&group->delayed_queue, qe(call))) { if (call->deadline <= timestamp) { - _delayed_call_dequeue(call); - - _pending_call_enqueue(call); + _pending_call_enqueue(call, group); new_pending = TRUE; } else break; - call = TC(queue_first(&thread_call_delayed_queue)); + call = TC(queue_first(&group->delayed_queue)); } - if (!queue_end(&thread_call_delayed_queue, qe(call))) - _set_delayed_call_timer(call); + if (!queue_end(&group->delayed_queue, qe(call))) + _set_delayed_call_timer(call, group); - if (new_pending && thread_call_vars.active_num <= 0) - _call_thread_wake(); + if (new_pending && group->active_count == 0) + thread_call_wake(group); simple_unlock(&thread_call_lock); - splx(s); } diff --git a/osfmk/kern/thread_call.h b/osfmk/kern/thread_call.h index 296c45079..fbfa0fb2d 100644 --- a/osfmk/kern/thread_call.h +++ b/osfmk/kern/thread_call.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 1993-1995, 1999-2007 Apple Inc. All rights reserved. + * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -45,45 +45,31 @@ typedef void (*thread_call_func_t)( thread_call_param_t param1); __BEGIN_DECLS -boolean_t -thread_call_enter( - thread_call_t call -); - -boolean_t -thread_call_enter1( - thread_call_t call, - thread_call_param_t param1 -); - -boolean_t -thread_call_enter_delayed( - thread_call_t call, - uint64_t deadline -); - -boolean_t -thread_call_enter1_delayed( - thread_call_t call, - thread_call_param_t param1, - uint64_t deadline -); - -boolean_t -thread_call_cancel( - thread_call_t call -); - -thread_call_t -thread_call_allocate( - thread_call_func_t func, - thread_call_param_t param0 -); - -boolean_t -thread_call_free( - thread_call_t call -); +extern boolean_t thread_call_enter( + thread_call_t call); + +extern boolean_t thread_call_enter1( + thread_call_t call, + thread_call_param_t param1); + +extern boolean_t thread_call_enter_delayed( + thread_call_t call, + uint64_t deadline); + +extern boolean_t thread_call_enter1_delayed( + thread_call_t call, + thread_call_param_t param1, + uint64_t deadline); + +extern boolean_t thread_call_cancel( + thread_call_t call); + +extern thread_call_t thread_call_allocate( + thread_call_func_t func, + thread_call_param_t param0); + +extern boolean_t thread_call_free( + thread_call_t call); __END_DECLS @@ -93,15 +79,12 @@ __END_DECLS typedef struct call_entry thread_call_data_t; -void -thread_call_initialize(void); +extern void thread_call_initialize(void); -void -thread_call_setup( - thread_call_t call, - thread_call_func_t func, - thread_call_param_t param0 -); +extern void thread_call_setup( + thread_call_t call, + thread_call_func_t func, + thread_call_param_t param0); #endif /* MACH_KERNEL_PRIVATE */ @@ -113,32 +96,24 @@ __BEGIN_DECLS * Obsolete interfaces. */ -boolean_t -thread_call_is_delayed( - thread_call_t call, - uint64_t *deadline -); - -void -thread_call_func( - thread_call_func_t func, - thread_call_param_t param, - boolean_t unique_call -); - -void -thread_call_func_delayed( - thread_call_func_t func, - thread_call_param_t param, - uint64_t deadline -); - -boolean_t -thread_call_func_cancel( - thread_call_func_t func, - thread_call_param_t param, - boolean_t cancel_all -); +extern boolean_t thread_call_is_delayed( + thread_call_t call, + uint64_t *deadline); + +extern void thread_call_func( + thread_call_func_t func, + thread_call_param_t param, + boolean_t unique_call); + +extern void thread_call_func_delayed( + thread_call_func_t func, + thread_call_param_t param, + uint64_t deadline); + +extern boolean_t thread_call_func_cancel( + thread_call_func_t func, + thread_call_param_t param, + boolean_t cancel_all); #ifndef MACH_KERNEL_PRIVATE diff --git a/osfmk/kern/timer_call.c b/osfmk/kern/timer_call.c index 941061c3d..e091f6707 100644 --- a/osfmk/kern/timer_call.c +++ b/osfmk/kern/timer_call.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1993-2007 Apple Inc. All rights reserved. + * Copyright (c) 1993-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -45,27 +46,13 @@ decl_simple_lock_data(static,timer_call_lock) -static void -timer_call_interrupt( - uint64_t timestamp); - #define qe(x) ((queue_entry_t)(x)) #define TC(x) ((timer_call_t)(x)) void timer_call_initialize(void) { - spl_t s; - simple_lock_init(&timer_call_lock, 0); - - s = splclock(); - simple_lock(&timer_call_lock); - - clock_set_timer_func((clock_timer_func_t)timer_call_interrupt); - - simple_unlock(&timer_call_lock); - splx(s); } void @@ -77,224 +64,205 @@ timer_call_setup( call_entry_setup(call, func, param0); } -static __inline__ -void -_delayed_call_enqueue( - queue_t queue, - timer_call_t call) +__inline__ queue_t +call_entry_enqueue_deadline( + call_entry_t entry, + queue_t queue, + uint64_t deadline) { + queue_t old_queue = entry->queue; timer_call_t current; - current = TC(queue_first(queue)); + if (old_queue != queue || entry->deadline < deadline) { + if (old_queue != queue) + current = TC(queue_first(queue)); + else + current = TC(queue_next(qe(entry))); - while (TRUE) { - if ( queue_end(queue, qe(current)) || - call->deadline < current->deadline ) { - current = TC(queue_prev(qe(current))); - break; + if (old_queue != NULL) + (void)remque(qe(entry)); + + while (TRUE) { + if ( queue_end(queue, qe(current)) || + deadline < current->deadline ) { + current = TC(queue_prev(qe(current))); + break; + } + + current = TC(queue_next(qe(current))); } - current = TC(queue_next(qe(current))); + insque(qe(entry), qe(current)); } + else + if (deadline < entry->deadline) { + current = TC(queue_prev(qe(entry))); - insque(qe(call), qe(current)); + (void)remque(qe(entry)); - call->state = DELAYED; -} + while (TRUE) { + if ( queue_end(queue, qe(current)) || + current->deadline <= deadline ) { + break; + } -static __inline__ -void -_delayed_call_dequeue( - timer_call_t call) -{ - (void)remque(qe(call)); + current = TC(queue_prev(qe(current))); + } - call->state = IDLE; -} + insque(qe(entry), qe(current)); + } -static __inline__ -void -_set_delayed_call_timer( - timer_call_t call) -{ - etimer_set_deadline(call->deadline); + entry->queue = queue; + entry->deadline = deadline; + + return (old_queue); } -boolean_t -timer_call_enter( - timer_call_t call, - uint64_t deadline) +__inline__ queue_t +call_entry_enqueue_tail( + call_entry_t entry, + queue_t queue) { - boolean_t result = TRUE; - queue_t queue; - spl_t s; + queue_t old_queue = entry->queue; - s = splclock(); - simple_lock(&timer_call_lock); + if (old_queue != NULL) + (void)remque(qe(entry)); - if (call->state == DELAYED) - _delayed_call_dequeue(call); - else - result = FALSE; + enqueue_tail(queue, qe(entry)); - call->param1 = NULL; - call->deadline = deadline; + entry->queue = queue; - queue = &PROCESSOR_DATA(current_processor(), timer_call_queue); + return (old_queue); +} - _delayed_call_enqueue(queue, call); +__inline__ queue_t +call_entry_dequeue( + call_entry_t entry) +{ + queue_t old_queue = entry->queue; - if (queue_first(queue) == qe(call)) - _set_delayed_call_timer(call); + if (old_queue != NULL) + (void)remque(qe(entry)); - simple_unlock(&timer_call_lock); - splx(s); + entry->queue = NULL; - return (result); + return (old_queue); } boolean_t -timer_call_enter1( - timer_call_t call, - timer_call_param_t param1, - uint64_t deadline) +timer_call_enter( + timer_call_t call, + uint64_t deadline) { - boolean_t result = TRUE; - queue_t queue; + queue_t queue, old_queue; spl_t s; s = splclock(); 
simple_lock(&timer_call_lock); - if (call->state == DELAYED) - _delayed_call_dequeue(call); - else - result = FALSE; - - call->param1 = param1; - call->deadline = deadline; - - queue = &PROCESSOR_DATA(current_processor(), timer_call_queue); + queue = timer_queue_assign(deadline); - _delayed_call_enqueue(queue, call); + old_queue = call_entry_enqueue_deadline(call, queue, deadline); - if (queue_first(queue) == qe(call)) - _set_delayed_call_timer(call); + call->param1 = NULL; simple_unlock(&timer_call_lock); splx(s); - return (result); + return (old_queue != NULL); } boolean_t -timer_call_cancel( - timer_call_t call) +timer_call_enter1( + timer_call_t call, + timer_call_param_t param1, + uint64_t deadline) { - boolean_t result = TRUE; + queue_t queue, old_queue; spl_t s; s = splclock(); simple_lock(&timer_call_lock); - if (call->state == DELAYED) { - queue_t queue = &PROCESSOR_DATA(current_processor(), timer_call_queue); + queue = timer_queue_assign(deadline); - if (queue_first(queue) == qe(call)) { - _delayed_call_dequeue(call); + old_queue = call_entry_enqueue_deadline(call, queue, deadline); - if (!queue_empty(queue)) - _set_delayed_call_timer((timer_call_t)queue_first(queue)); - } - else - _delayed_call_dequeue(call); - } - else - result = FALSE; + call->param1 = param1; simple_unlock(&timer_call_lock); splx(s); - return (result); + return (old_queue != NULL); } boolean_t -timer_call_is_delayed( - timer_call_t call, - uint64_t *deadline) +timer_call_cancel( + timer_call_t call) { - boolean_t result = FALSE; + queue_t old_queue; spl_t s; s = splclock(); simple_lock(&timer_call_lock); - if (call->state == DELAYED) { - if (deadline != NULL) - *deadline = call->deadline; - result = TRUE; + old_queue = call_entry_dequeue(call); + + if (old_queue != NULL) { + if (!queue_empty(old_queue)) + timer_queue_cancel(old_queue, call->deadline, TC(queue_first(old_queue))->deadline); + else + timer_queue_cancel(old_queue, call->deadline, UINT64_MAX); } simple_unlock(&timer_call_lock); splx(s); - return (result); + return (old_queue != NULL); } -/* - * Called at splclock. 
- */ - void -timer_call_shutdown( - processor_t processor) +timer_queue_shutdown( + queue_t queue) { - timer_call_t call; - queue_t queue, myqueue; - - assert(processor != current_processor()); - - queue = &PROCESSOR_DATA(processor, timer_call_queue); - myqueue = &PROCESSOR_DATA(current_processor(), timer_call_queue); + timer_call_t call; + queue_t new_queue; + spl_t s; + s = splclock(); simple_lock(&timer_call_lock); call = TC(queue_first(queue)); while (!queue_end(queue, qe(call))) { - _delayed_call_dequeue(call); + new_queue = timer_queue_assign(call->deadline); - _delayed_call_enqueue(myqueue, call); + call_entry_enqueue_deadline(call, new_queue, call->deadline); call = TC(queue_first(queue)); } - call = TC(queue_first(myqueue)); - - if (!queue_end(myqueue, qe(call))) - _set_delayed_call_timer(call); - simple_unlock(&timer_call_lock); + splx(s); } -static void -timer_call_interrupt(uint64_t timestamp) +uint64_t +timer_queue_expire( + queue_t queue, + uint64_t deadline) { - timer_call_t call; - queue_t queue; + timer_call_t call; simple_lock(&timer_call_lock); - queue = &PROCESSOR_DATA(current_processor(), timer_call_queue); - call = TC(queue_first(queue)); while (!queue_end(queue, qe(call))) { - if (call->deadline <= timestamp) { + if (call->deadline <= deadline) { timer_call_func_t func; timer_call_param_t param0, param1; - _delayed_call_dequeue(call); + call_entry_dequeue(call); func = call->func; param0 = call->param0; @@ -331,14 +299,19 @@ timer_call_interrupt(uint64_t timestamp) (unsigned int)param1, 0, 0); simple_lock(&timer_call_lock); - } else + } + else break; call = TC(queue_first(queue)); } if (!queue_end(queue, qe(call))) - _set_delayed_call_timer(call); + deadline = call->deadline; + else + deadline = UINT64_MAX; simple_unlock(&timer_call_lock); + + return (deadline); } diff --git a/osfmk/kern/timer_call.h b/osfmk/kern/timer_call.h index d3beccfc7..061e3d96c 100644 --- a/osfmk/kern/timer_call.h +++ b/osfmk/kern/timer_call.h @@ -1,6 +1,5 @@ /* - * Copyright (c) 1993-1995, 1999-2000 Apple Computer, Inc. - * All rights reserved. + * Copyright (c) 1993-1995, 1999-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -28,11 +27,6 @@ */ /* * Declarations for timer interrupt callouts. - * - * HISTORY - * - * 20 December 2000 (debo) - * Created. 
*/ #ifndef _KERN_TIMER_CALL_H_ @@ -48,42 +42,28 @@ typedef void (*timer_call_func_t)( timer_call_param_t param0, timer_call_param_t param1); -boolean_t -timer_call_enter( - timer_call_t call, - uint64_t deadline); +extern boolean_t timer_call_enter( + timer_call_t call, + uint64_t deadline); -boolean_t -timer_call_enter1( - timer_call_t call, - timer_call_param_t param1, - uint64_t deadline); +extern boolean_t timer_call_enter1( + timer_call_t call, + timer_call_param_t param1, + uint64_t deadline); -boolean_t -timer_call_cancel( - timer_call_t call); - -boolean_t -timer_call_is_delayed( - timer_call_t call, - uint64_t *deadline); +extern boolean_t timer_call_cancel( + timer_call_t call); #include typedef struct call_entry timer_call_data_t; -void -timer_call_initialize(void); - -void -timer_call_setup( - timer_call_t call, - timer_call_func_t func, - timer_call_param_t param0); +extern void timer_call_initialize(void); -void -timer_call_shutdown( - processor_t processor); +extern void timer_call_setup( + timer_call_t call, + timer_call_func_t func, + timer_call_param_t param0); #endif /* MACH_KERNEL_PRIVATE */ diff --git a/osfmk/kern/timer_queue.h b/osfmk/kern/timer_queue.h new file mode 100644 index 000000000..050b09afa --- /dev/null +++ b/osfmk/kern/timer_queue.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2008 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +/* + * Timer queue support routines. + */ + +#ifndef _KERN_TIMER_QUEUE_H_ +#define _KERN_TIMER_QUEUE_H_ + +#include + +#ifdef MACH_KERNEL_PRIVATE + +#include + +/* + * Invoked by kernel, implemented by platform. + */ + +/* Request an expiration deadline, returns queue association */ +extern queue_t timer_queue_assign( + uint64_t deadline); + +/* Cancel an associated expiration deadline and specify new deadline */ +extern void timer_queue_cancel( + queue_t queue, + uint64_t deadline, + uint64_t new_deadline); + +/* + * Invoked by platform, implemented by kernel. 
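/*
 * [Editorial aside -- illustrative sketch, not part of the patch above or below.]
 * The new timer_queue interface splits responsibility in two directions: the
 * kernel asks the platform which queue a deadline belongs to
 * (timer_queue_assign / timer_queue_cancel), and the platform hands an expired
 * queue back to the kernel to drain (timer_queue_expire / timer_queue_shutdown,
 * declared just below).  A rough user-space analog of the expiry half of that
 * contract, using a hypothetical sorted singly-linked list in place of the
 * kernel's per-CPU queue:
 */

#include <stdint.h>
#include <stdio.h>

struct fake_timer {
	struct fake_timer	*next;		/* sorted by ascending deadline */
	uint64_t		deadline;
	void			(*func)(void *);
	void			*arg;
};

/* Analog of timer_queue_expire(): run every entry at or before "now" and
 * return the deadline the platform should program next (UINT64_MAX if none). */
static uint64_t
fake_queue_expire(struct fake_timer **head, uint64_t now)
{
	while (*head != NULL && (*head)->deadline <= now) {
		struct fake_timer *t = *head;

		*head = t->next;		/* dequeue before calling out */
		t->func(t->arg);
	}
	return (*head != NULL) ? (*head)->deadline : UINT64_MAX;
}

static void say(void *msg) { printf("%s\n", (const char *)msg); }

int
main(void)
{
	struct fake_timer late  = { NULL,  200, say, "late timer"  };
	struct fake_timer early = { &late, 100, say, "early timer" };
	struct fake_timer *queue = &early;

	/* Fires only "early timer"; reports 200 as the next deadline to arm. */
	uint64_t next = fake_queue_expire(&queue, 150);
	printf("next deadline: %llu\n", (unsigned long long)next);
	return 0;
}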
+ */ + +/* Process deadline expiration for queue, returns new deadline */ +extern uint64_t timer_queue_expire( + queue_t queue, + uint64_t deadline); + +/* Shutdown a timer queue and reassign existing activities */ +extern void timer_queue_shutdown( + queue_t queue); + +#endif /* MACH_KERNEL_PRIVATE */ + +#endif /* _KERN_TIMER_QUEUE_H_ */ diff --git a/osfmk/kern/zalloc.c b/osfmk/kern/zalloc.c index e189edb2c..f8ac4c12f 100644 --- a/osfmk/kern/zalloc.c +++ b/osfmk/kern/zalloc.c @@ -102,88 +102,73 @@ #include #endif -int check_freed_element = 0; -#if MACH_ASSERT -/* Detect use of zone elt after freeing it by two methods: +/* + * Zone Corruption Debugging + * + * We provide three methods to detect use of a zone element after it's been freed. These + * checks are enabled by specifying "-zc" and/or "-zp" in the boot-args: + * * (1) Range-check the free-list "next" ptr for sanity. * (2) Store the ptr in two different words, and compare them against - * each other when re-using the zone elt, to detect modifications; + * each other when re-using the zone element, to detect modifications. + * (3) poison the freed memory by overwriting it with 0xdeadbeef. + * + * The first two checks are farily light weight and are enabled by specifying "-zc" + * in the boot-args. If you want more aggressive checking for use-after-free bugs + * and you don't mind the additional overhead, then turn on poisoning by adding + * "-zp" to the boot-args in addition to "-zc". If you specify -zp without -zc, + * it still poisons the memory when it's freed, but doesn't check if the memory + * has been altered later when it's reallocated. */ -#if defined(__alpha) - -#define is_kernel_data_addr(a) \ - (!(a) || (IS_SYS_VA(a) && !((a) & (sizeof(long)-1)))) - -#else /* !defined(__alpha) */ +boolean_t check_freed_element = FALSE; /* enabled by -zc in boot-args */ +boolean_t zfree_clear = FALSE; /* enabled by -zp in boot-args */ -#define is_kernel_data_addr(a) \ - (!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3))) - -#endif /* defined(__alpha) */ - -/* Should we set all words of the zone element to an illegal address - * when it is freed, to help catch usage after freeing? The down-side - * is that this obscures the identity of the freed element. 
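/*
 * [Editorial aside -- illustrative sketch, not part of the patch above or below.]
 * A minimal user-space rendering of the poisoning scheme the -zp / -zc checks
 * implement: fill a freed element with 0xdeadbeef and verify the pattern when
 * the element is handed out again.  The element size and helper names here are
 * made up for illustration; the real macros also skip the word(s) that carry
 * the free-list linkage, which this sketch ignores.
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ELEM_WORDS	8		/* illustrative element size, in 32-bit words */
#define POISON		0xdeadbeefU

/* "zfree" side: poison the element as it goes onto the free list. */
static void
poison_element(uint32_t *elem)
{
	unsigned int i;

	for (i = 0; i < ELEM_WORDS; i++)
		elem[i] = POISON;
}

/* "zalloc" side: verify the poison survived while the element sat free. */
static int
element_was_modified(const uint32_t *elem)
{
	unsigned int i;

	for (i = 0; i < ELEM_WORDS; i++)
		if (elem[i] != POISON)
			return 1;
	return 0;
}

int
main(void)
{
	uint32_t *elem = malloc(ELEM_WORDS * sizeof (uint32_t));

	poison_element(elem);			/* freed with poisoning enabled */
	elem[3] = 42;				/* buggy caller writes after free */

	if (element_was_modified(elem))		/* reallocation detects the damage */
		printf("freed element was modified -- the kernel would panic here\n");

	free(elem);
	return 0;
}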
- */ -boolean_t zfree_clear = FALSE; +#define is_kernel_data_addr(a) (!(a) || ((a) >= vm_min_kernel_address && !((a) & 0x3))) #define ADD_TO_ZONE(zone, element) \ MACRO_BEGIN \ - if (zfree_clear) \ - { unsigned int i; \ - for (i=1; \ - i < zone->elem_size/sizeof(vm_offset_t) - 1; \ - i++) \ - ((vm_offset_t *)(element))[i] = 0xdeadbeef; \ - } \ - ((vm_offset_t *)(element))[0] = (zone)->free_elements; \ - (zone)->free_elements = (vm_offset_t) (element); \ - (zone)->count--; \ -MACRO_END - -#define REMOVE_FROM_ZONE(zone, ret, type) \ -MACRO_BEGIN \ - (ret) = (type) (zone)->free_elements; \ - if ((ret) != (type) 0) { \ - if (!is_kernel_data_addr(((vm_offset_t *)(ret))[0])) { \ - panic("A freed zone element has been modified.\n"); \ - } \ - (zone)->count++; \ - (zone)->free_elements = *((vm_offset_t *)(ret)); \ + if (zfree_clear) \ + { unsigned int i; \ + for (i=0; \ + i < zone->elem_size/sizeof(uint32_t); \ + i++) \ + ((uint32_t *)(element))[i] = 0xdeadbeef; \ } \ -MACRO_END -#else /* MACH_ASSERT */ - -#define ADD_TO_ZONE(zone, element) \ -MACRO_BEGIN \ - *((vm_offset_t *)(element)) = (zone)->free_elements; \ - if (check_freed_element) { \ - if ((zone)->elem_size >= (2 * sizeof(vm_offset_t))) \ - ((vm_offset_t *)(element))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \ - (zone)->free_elements; \ - } \ - (zone)->free_elements = (vm_offset_t) (element); \ - (zone)->count--; \ -MACRO_END - -#define REMOVE_FROM_ZONE(zone, ret, type) \ -MACRO_BEGIN \ - (ret) = (type) (zone)->free_elements; \ - if ((ret) != (type) 0) { \ - if (check_freed_element) { \ - if ((zone)->elem_size >= (2 * sizeof(vm_offset_t)) && \ - ((vm_offset_t *)(ret))[((zone)->elem_size/sizeof(vm_offset_t))-1] != \ - ((vm_offset_t *)(ret))[0]) \ - panic("a freed zone element has been modified");\ - } \ - (zone)->count++; \ - (zone)->free_elements = *((vm_offset_t *)(ret)); \ + *((vm_offset_t *)(element)) = (zone)->free_elements; \ + if (check_freed_element) { \ + if ((zone)->elem_size >= (2 * sizeof(vm_offset_t))) \ + ((vm_offset_t *)(element))[((zone)->elem_size/sizeof(vm_offset_t))-1] = \ + (zone)->free_elements; \ } \ + (zone)->free_elements = (vm_offset_t) (element); \ + (zone)->count--; \ MACRO_END -#endif /* MACH_ASSERT */ +#define REMOVE_FROM_ZONE(zone, ret, type) \ +MACRO_BEGIN \ + (ret) = (type) (zone)->free_elements; \ + if ((ret) != (type) 0) { \ + if (check_freed_element) { \ + if (!is_kernel_data_addr(((vm_offset_t *)(ret))[0]) || \ + ((zone)->elem_size >= (2 * sizeof(vm_offset_t)) && \ + ((vm_offset_t *)(ret))[((zone)->elem_size/sizeof(vm_offset_t))-1] != \ + ((vm_offset_t *)(ret))[0])) \ + panic("a freed zone element has been modified");\ + if (zfree_clear) { \ + unsigned int ii; \ + for (ii = sizeof(vm_offset_t) / sizeof(uint32_t); \ + ii < zone->elem_size/sizeof(uint32_t) - sizeof(vm_offset_t) / sizeof(uint32_t); \ + ii++) \ + if (((uint32_t *)(ret))[ii] != (uint32_t)0xdeadbeef) \ + panic("a freed zone element has been modified");\ + } \ + } \ + (zone)->count++; \ + (zone)->free_elements = *((vm_offset_t *)(ret)); \ + } \ +MACRO_END #if ZONE_DEBUG #define zone_debug_enabled(z) z->active_zones.next @@ -326,10 +311,146 @@ unsigned int num_zones; boolean_t zone_gc_allowed = TRUE; boolean_t zone_gc_forced = FALSE; +boolean_t panic_include_zprint = FALSE; unsigned zone_gc_last_tick = 0; unsigned zone_gc_max_rate = 0; /* in ticks */ +/* + * Zone leak debugging code + * + * When enabled, this code keeps a log to track allocations to a particular zone that have not + * yet been freed. 
Examining this log will reveal the source of a zone leak. The log is allocated + * only when logging is enabled, so there is no effect on the system when it's turned off. Logging is + * off by default. + * + * Enable the logging via the boot-args. Add the parameter "zlog=" to boot-args where + * is the name of the zone you wish to log. + * + * This code only tracks one zone, so you need to identify which one is leaking first. + * Generally, you'll know you have a leak when you get a "zalloc retry failed 3" panic from the zone + * garbage collector. Note that the zone name printed in the panic message is not necessarily the one + * containing the leak. So do a zprint from gdb and locate the zone with the bloated size. This + * is most likely the problem zone, so set zlog in boot-args to this zone name, reboot and re-run the test. The + * next time it panics with this message, examine the log using the kgmacros zstack, findoldest and countpcs. + * See the help in the kgmacros for usage info. + * + * + * Zone corruption logging + * + * Logging can also be used to help identify the source of a zone corruption. First, identify the zone + * that is being corrupted, then add "-zc zlog=" to the boot-args. When -zc is used in conjunction + * with zlog, it changes the logging style to track both allocations and frees to the zone. So when the + * corruption is detected, examining the log will show you the stack traces of the callers who last allocated + * and freed any particular element in the zone. Use the findelem kgmacro with the address of the element that's been + * corrupted to examine its history. This should lead to the source of the corruption. + */ + +static int log_records; /* size of the log, expressed in number of records */ + +#define MAX_ZONE_NAME 32 /* max length of a zone name we can take from the boot-args */ + +static char zone_name_to_log[MAX_ZONE_NAME] = ""; /* the zone name we're logging, if any */ + +/* + * The number of records in the log is configurable via the zrecs parameter in boot-args. Set this to + * the number of records you want in the log. For example, "zrecs=1000" sets it to 1000 records. Note + * that the larger the size of the log, the slower the system will run due to linear searching in the log, + * but one doesn't generally care about performance when tracking down a leak. The log is capped at 8000 + * records since going much larger than this tends to make the system unresponsive and unbootable on small + * memory configurations. The default value is 4000 records. + * + * MAX_DEPTH configures how deep of a stack trace is taken on each zalloc in the zone of interrest. 15 + * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual + * caller is up above these lower levels. + */ + +#define ZRECORDS_MAX 8000 /* Max records allowed in the log */ +#define ZRECORDS_DEFAULT 4000 /* default records in log if zrecs is not specificed in boot-args */ +#define MAX_DEPTH 15 /* number of levels of the stack trace to record */ +/* + * Each record in the log contains a pointer to the zone element it refers to, a "time" number that allows + * the records to be ordered chronologically, and a small array to hold the pc's from the stack trace. A + * record is added to the log each time a zalloc() is done in the zone_of_interest. For leak debugging, + * the record is cleared when a zfree() is done. For corruption debugging, the log tracks both allocs and frees. 
+ * If the log fills, old records are replaced as if it were a circular buffer. + */ + +struct zrecord { + void *z_element; /* the element that was zalloc'ed of zfree'ed */ + uint32_t z_opcode:1, /* whether it was a zalloc or zfree */ + z_time:31; /* time index when operation was done */ + void *z_pc[MAX_DEPTH]; /* stack trace of caller */ +}; + +/* + * Opcodes for the z_opcode field: + */ + +#define ZOP_ALLOC 1 +#define ZOP_FREE 0 + +/* + * The allocation log and all the related variables are protected by the zone lock for the zone_of_interest + */ + +static struct zrecord *zrecords; /* the log itself, dynamically allocated when logging is enabled */ +static int zcurrent = 0; /* index of the next slot in the log to use */ +static int zrecorded = 0; /* number of allocations recorded in the log */ +static unsigned int ztime = 0; /* a timestamp of sorts */ +static zone_t zone_of_interest = NULL; /* the zone being watched; corresponds to zone_name_to_log */ + +/* + * Decide if we want to log this zone by doing a string compare between a zone name and the name + * of the zone to log. Return true if the strings are equal, false otherwise. Because it's not + * possible to include spaces in strings passed in via the boot-args, a period in the logname will + * match a space in the zone name. + */ + +static int +log_this_zone(const char *zonename, const char *logname) +{ + int len; + const char *zc = zonename; + const char *lc = logname; + + /* + * Compare the strings. We bound the compare by MAX_ZONE_NAME. + */ + + for (len = 1; len <= MAX_ZONE_NAME; zc++, lc++, len++) { + + /* + * If the current characters don't match, check for a space in + * in the zone name and a corresponding period in the log name. + * If that's not there, then the strings don't match. + */ + + if (*zc != *lc && !(*zc == ' ' && *lc == '.')) + break; + + /* + * The strings are equal so far. If we're at the end, then it's a match. + */ + + if (*zc == '\0') + return TRUE; + } + + return FALSE; +} + + +/* + * Test if we want to log this zalloc/zfree event. We log if this is the zone we're interested in and + * the buffer for the records has been allocated. + */ + +#define DO_LOGGING(z) (zrecords && (z) == zone_of_interest) + +extern boolean_t zlog_ready; + + /* * zinit initializes a new zone. The zone data structures themselves * are stored in a zone, which is initially a static structure that @@ -435,6 +556,40 @@ use_this_allocation: num_zones++; simple_unlock(&all_zones_lock); + /* + * Check if we should be logging this zone. If so, remember the zone pointer. + */ + + if (log_this_zone(z->zone_name, zone_name_to_log)) { + zone_of_interest = z; + } + + /* + * If we want to log a zone, see if we need to allocate buffer space for the log. Some vm related zones are + * zinit'ed before we can do a kmem_alloc, so we have to defer allocation in that case. zlog_ready is set to + * TRUE once enough of the VM system is up and running to allow a kmem_alloc to work. If we want to log one + * of the VM related zones that's set up early on, we will skip allocation of the log until zinit is called again + * later on some other zone. So note we may be allocating a buffer to log a zone other than the one being initialized + * right now. + */ + + if (zone_of_interest != NULL && zrecords == NULL && zlog_ready) { + if (kmem_alloc(kernel_map, (vm_offset_t *)&zrecords, log_records * sizeof(struct zrecord)) == KERN_SUCCESS) { + + /* + * We got the memory for the log. Zero it out since the code needs this to identify unused records. 
+ * At this point, everything is set up and we're ready to start logging this zone. + */ + + bzero((void *)zrecords, log_records * sizeof(struct zrecord)); + printf("zone: logging started for zone %s (%p)\n", zone_of_interest->zone_name, zone_of_interest); + + } else { + printf("zone: couldn't allocate memory for zrecords, turning off zleak logging\n"); + zone_of_interest = NULL; + } + } + return(z); } @@ -613,9 +768,40 @@ zone_bootstrap(void) vm_offset_t zone_zone_space; char temp_buf[16]; - /* see if we want freed zone element checking */ + /* see if we want freed zone element checking and/or poisoning */ if (PE_parse_boot_argn("-zc", temp_buf, sizeof (temp_buf))) { - check_freed_element = 1; + check_freed_element = TRUE; + } + + if (PE_parse_boot_argn("-zp", temp_buf, sizeof (temp_buf))) { + zfree_clear = TRUE; + } + + /* + * Check for and set up zone leak detection if requested via boot-args. We recognized two + * boot-args: + * + * zlog= + * zrecs= + * + * The zlog arg is used to specify the zone name that should be logged, and zrecs is used to + * control the size of the log. If zrecs is not specified, a default value is used. + */ + + if (PE_parse_boot_argn("zlog", zone_name_to_log, sizeof(zone_name_to_log)) == TRUE) { + if (PE_parse_boot_argn("zrecs", &log_records, sizeof(log_records)) == TRUE) { + + /* + * Don't allow more than ZRECORDS_MAX records even if the user asked for more. + * This prevents accidentally hogging too much kernel memory and making the system + * unusable. + */ + + log_records = MIN(ZRECORDS_MAX, log_records); + + } else { + log_records = ZRECORDS_DEFAULT; + } } simple_lock_init(&all_zones_lock, 0); @@ -681,9 +867,19 @@ zalloc_canblock( { vm_offset_t addr; kern_return_t retval; + void *bt[MAX_DEPTH]; /* only used if zone logging is enabled */ + int numsaved = 0; + int i; assert(zone != ZONE_NULL); + /* + * If zone logging is turned on and this is the zone we're tracking, grab a backtrace. + */ + + if (DO_LOGGING(zone)) + numsaved = OSBacktrace(&bt[0], MAX_DEPTH); + lock_zone(zone); REMOVE_FROM_ZONE(zone, addr, vm_offset_t); @@ -765,8 +961,10 @@ zalloc_canblock( zone_gc(); printf("zalloc did gc\n"); } - if (retry == 3) + if (retry == 3) { + panic_include_zprint = TRUE; panic("zalloc: \"%s\" (%d elements) retry fail %d", zone->zone_name, zone->count, retval); + } } else { break; } @@ -826,6 +1024,76 @@ zalloc_canblock( REMOVE_FROM_ZONE(zone, addr, vm_offset_t); } + /* + * See if we should be logging allocations in this zone. Logging is rarely done except when a leak is + * suspected, so this code rarely executes. We need to do this code while still holding the zone lock + * since it protects the various log related data structures. + */ + + if (DO_LOGGING(zone) && addr) { + + /* + * Look for a place to record this new allocation. We implement two different logging strategies + * depending on whether we're looking for the source of a zone leak or a zone corruption. When looking + * for a leak, we want to log as many allocations as possible in order to clearly identify the leaker + * among all the records. So we look for an unused slot in the log and fill that in before overwriting + * an old entry. When looking for a corrution however, it's better to have a chronological log of all + * the allocations and frees done in the zone so that the history of operations for a specific zone + * element can be inspected. So in this case, we treat the log as a circular buffer and overwrite the + * oldest entry whenever a new one needs to be added. 
+ * + * The check_freed_element flag tells us what style of logging to do. It's set if we're supposed to be + * doing corruption style logging (indicated via -zc in the boot-args). + */ + + if (!check_freed_element && zrecords[zcurrent].z_element && zrecorded < log_records) { + + /* + * If we get here, we're doing leak style logging and there's still some unused entries in + * the log (since zrecorded is smaller than the size of the log). Look for an unused slot + * starting at zcurrent and wrap-around if we reach the end of the buffer. If the buffer + * is already full, we just fall through and overwrite the element indexed by zcurrent. + */ + + for (i = zcurrent; i < log_records; i++) { + if (zrecords[i].z_element == NULL) { + zcurrent = i; + goto empty_slot; + } + } + + for (i = 0; i < zcurrent; i++) { + if (zrecords[i].z_element == NULL) { + zcurrent = i; + goto empty_slot; + } + } + } + + /* + * Save a record of this allocation + */ + +empty_slot: + if (zrecords[zcurrent].z_element == NULL) + zrecorded++; + + zrecords[zcurrent].z_element = (void *)addr; + zrecords[zcurrent].z_time = ztime++; + zrecords[zcurrent].z_opcode = ZOP_ALLOC; + + for (i = 0; i < numsaved; i++) + zrecords[zcurrent].z_pc[i] = bt[i]; + + for (; i < MAX_DEPTH; i++) + zrecords[zcurrent].z_pc[i] = 0; + + zcurrent++; + + if (zcurrent >= log_records) + zcurrent = 0; + } + if ((addr == 0) && !canblock && (zone->async_pending == FALSE) && (zone->exhaustible == FALSE) && (!vm_pool_low())) { zone->async_pending = TRUE; unlock_zone(zone); @@ -922,6 +1190,17 @@ zfree( void *addr) { vm_offset_t elem = (vm_offset_t) addr; + void *bt[MAX_DEPTH]; /* only used if zone logging is enable via boot-args */ + int numsaved = 0; + + assert(zone != ZONE_NULL); + + /* + * If zone logging is turned on and this is the zone we're tracking, grab a backtrace. + */ + + if (DO_LOGGING(zone)) + numsaved = OSBacktrace(&bt[0], MAX_DEPTH); #if MACH_ASSERT /* Basic sanity checks */ @@ -945,6 +1224,61 @@ zfree( } lock_zone(zone); + + /* + * See if we're doing logging on this zone. There are two styles of logging used depending on + * whether we're trying to catch a leak or corruption. See comments above in zalloc for details. + */ + + if (DO_LOGGING(zone)) { + int i; + + if (check_freed_element) { + + /* + * We're logging to catch a corruption. Add a record of this zfree operation + * to log. + */ + + if (zrecords[zcurrent].z_element == NULL) + zrecorded++; + + zrecords[zcurrent].z_element = (void *)addr; + zrecords[zcurrent].z_time = ztime++; + zrecords[zcurrent].z_opcode = ZOP_FREE; + + for (i = 0; i < numsaved; i++) + zrecords[zcurrent].z_pc[i] = bt[i]; + + for (; i < MAX_DEPTH; i++) + zrecords[zcurrent].z_pc[i] = 0; + + zcurrent++; + + if (zcurrent >= log_records) + zcurrent = 0; + + } else { + + /* + * We're logging to catch a leak. Remove any record we might have for this + * element since it's being freed. Note that we may not find it if the buffer + * overflowed and that's OK. Since the log is of a limited size, old records + * get overwritten if there are more zallocs than zfrees. 
+ */ + + for (i = 0; i < log_records; i++) { + if (zrecords[i].z_element == addr) { + zrecords[i].z_element = NULL; + zcurrent = i; + zrecorded--; + break; + } + } + } + } + + #if ZONE_DEBUG if (zone_debug_enabled(zone)) { queue_t tmp_elem; diff --git a/osfmk/mach/Makefile b/osfmk/mach/Makefile index e2f2f2de7..946bc45e6 100644 --- a/osfmk/mach/Makefile +++ b/osfmk/mach/Makefile @@ -176,6 +176,10 @@ INSTALL_KF_MI_LCL_LIST = \ mach_interface.h \ $(filter-out mach_traps.h mach_syscalls.h thread_switch.h, ${DATAFILES}) +INSTALL_MI_LCL_LIST = kext_panic_report.h \ + bootstrap.h \ + ${DATAFILES} + INSTALL_MI_GEN_LIST = INSTALL_MI_DIR = mach diff --git a/osfmk/mach/kext_panic_report.h b/osfmk/mach/kext_panic_report.h new file mode 100644 index 000000000..1eb4f384f --- /dev/null +++ b/osfmk/mach/kext_panic_report.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2009 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ +#ifndef _KEXT_PANIC_REPORT_H_ +#define _KEXT_PANIC_REPORT_H_ + +#include + +__BEGIN_DECLS + +/******************************************************************************* +* String-compaction tables for panic reports' kext listing. +*******************************************************************************/ + +typedef struct subs_entry_t { + const char * substring; + char substitute; +} subs_entry_t; + +/* Prefix substitution list. Common prefixes are replaced with a single + * nonalphanumeric character at the beginning of the identifier. + * + * List should be in descending order of # components, and should then + * be in descending frequency order. + */ +subs_entry_t kext_identifier_prefix_subs[] = { + { "com.apple.driver.", '>' }, + { "com.apple.iokit.", '|' }, + { "com.apple.security.", '$' }, + { "com.apple.", '@' }, + + { (char *)0, '\0' } +}; + +/* Substring substitution list. Substrings are replaced with a '!' followed + * by a single letter mapping to the original string. + * + * List should be in descending frequency order, and within + * groups containing same prefix, in descending length order. 
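/*
 * [Editorial aside -- illustrative sketch, not part of the patch above or below.]
 * The header only supplies the substitution tables (the prefix table above and
 * the substring table that follows); the code that consumes them is not part of
 * this hunk.  A user-space sketch of one way such tables could be applied -- the
 * trimmed-down tables and the sample identifier are assumptions for illustration:
 */

#include <stdio.h>
#include <string.h>

typedef struct {
	const char	*substring;
	char		substitute;
} demo_subs_entry_t;

static const demo_subs_entry_t demo_prefix_subs[] = {
	{ "com.apple.driver.",	'>' },
	{ "com.apple.iokit.",	'|' },
	{ "com.apple.",		'@' },
	{ NULL, '\0' }
};

static const demo_subs_entry_t demo_substring_subs[] = {
	{ "AppleUSB",	'U' },		/* longer entries first, as the comment above asks */
	{ "Apple",	'A' },
	{ "Family",	'F' },
	{ NULL, '\0' }
};

/* Compact an identifier: swap a known prefix for one character, then swap
 * known substrings for '!' plus their letter; copy everything else verbatim. */
static void
compact_identifier(const char *id, char *out, size_t outlen)
{
	size_t	o = 0;
	int	i;

	for (i = 0; demo_prefix_subs[i].substring != NULL; i++) {
		size_t len = strlen(demo_prefix_subs[i].substring);

		if (strncmp(id, demo_prefix_subs[i].substring, len) == 0) {
			out[o++] = demo_prefix_subs[i].substitute;
			id += len;
			break;
		}
	}

	while (*id != '\0' && o + 2 < outlen) {
		int matched = 0;

		for (i = 0; demo_substring_subs[i].substring != NULL; i++) {
			size_t len = strlen(demo_substring_subs[i].substring);

			if (strncmp(id, demo_substring_subs[i].substring, len) == 0) {
				out[o++] = '!';
				out[o++] = demo_substring_subs[i].substitute;
				id += len;
				matched = 1;
				break;
			}
		}
		if (!matched)
			out[o++] = *id++;
	}
	out[o] = '\0';
}

int
main(void)
{
	char buf[64];

	/* Hypothetical identifier: "com.apple.driver.AppleUSBHub" -> ">!UHub" */
	compact_identifier("com.apple.driver.AppleUSBHub", buf, sizeof (buf));
	printf("%s\n", buf);
	return 0;
}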
+ */ +subs_entry_t kext_identifier_substring_subs[] = { + { "AppleUSB", 'U' }, + { "Apple", 'A' }, + { "Family", 'F' }, + { "Storage", 'S' }, + { "Controller", 'C' }, + { "Bluetooth", 'B' }, + { "Intel", 'I' }, + + // CHUD kexts, typically not on user installs + { "Profile", 'P' }, + { "Action", 'a' }, // maybe K if we want to stick to all-caps + + { (char *)0, '\0' } +}; + +__END_DECLS +#endif /* _KEXT_PANIC_REPORT_H_ */ diff --git a/osfmk/mach/kmod.h b/osfmk/mach/kmod.h index 4c6f34c05..beaa45af3 100644 --- a/osfmk/mach/kmod.h +++ b/osfmk/mach/kmod.h @@ -189,6 +189,9 @@ extern kern_return_t kmod_send_generic(int type, void *data, int size); extern kern_return_t kmod_initialize_cpp(kmod_info_t *info); extern kern_return_t kmod_finalize_cpp(kmod_info_t *info); +void record_kext_unload(kmod_t kmod_id); +void dump_kext_info(int (*printf_func)(const char *fmt, ...)); + extern void kmod_dump(vm_offset_t *addr, unsigned int dump_cnt); __END_DECLS diff --git a/osfmk/mach/machine.h b/osfmk/mach/machine.h index e28a2c537..3ec6039b9 100644 --- a/osfmk/mach/machine.h +++ b/osfmk/mach/machine.h @@ -347,6 +347,7 @@ __END_DECLS #define CPU_SUBTYPE_ARM_V6 ((cpu_subtype_t) 6) #define CPU_SUBTYPE_ARM_V5TEJ ((cpu_subtype_t) 7) #define CPU_SUBTYPE_ARM_XSCALE ((cpu_subtype_t) 8) +#define CPU_SUBTYPE_ARM_V7 ((cpu_subtype_t) 9) /* * CPU families (sysctl hw.cpufamily) @@ -371,6 +372,7 @@ __END_DECLS #define CPUFAMILY_ARM_9 0xe73283ae #define CPUFAMILY_ARM_11 0x8ff620d8 #define CPUFAMILY_ARM_XSCALE 0x53b005f5 +#define CPUFAMILY_ARM_13 0x0cc90e64 #define CPUFAMILY_INTEL_YONAH CPUFAMILY_INTEL_6_14 #define CPUFAMILY_INTEL_MEROM CPUFAMILY_INTEL_6_15 diff --git a/osfmk/mach/port.h b/osfmk/mach/port.h index ff3360297..5e1186540 100644 --- a/osfmk/mach/port.h +++ b/osfmk/mach/port.h @@ -308,6 +308,7 @@ typedef struct mach_port_status { #define MACH_PORT_QLIMIT_BASIC ((mach_port_msgcount_t) 5) #define MACH_PORT_QLIMIT_SMALL ((mach_port_msgcount_t) 16) #define MACH_PORT_QLIMIT_LARGE ((mach_port_msgcount_t) 1024) +#define MACH_PORT_QLIMIT_KERNEL ((mach_port_msgcount_t) 65536) #define MACH_PORT_QLIMIT_MIN MACH_PORT_QLIMIT_ZERO #define MACH_PORT_QLIMIT_DEFAULT MACH_PORT_QLIMIT_BASIC #define MACH_PORT_QLIMIT_MAX MACH_PORT_QLIMIT_LARGE diff --git a/osfmk/ppc/Diagnostics.c b/osfmk/ppc/Diagnostics.c index 5ac1d203a..df6f7e01d 100644 --- a/osfmk/ppc/Diagnostics.c +++ b/osfmk/ppc/Diagnostics.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -468,7 +468,7 @@ int diagCall(struct savearea *save) { prssr = (processor_t)port->ip_kobject; /* Extract the processor */ is_write_unlock(current_space()); /* All done with the space now, unlock it */ - save->save_r3 = (uint64_t)(uint32_t)PerProcTable[prssr->processor_data.slot_num].ppe_vaddr; /* Pass back ther per proc */ + save->save_r3 = (uint64_t)(uint32_t)PerProcTable[prssr->cpu_num].ppe_vaddr; /* Pass back ther per proc */ return -1; /* Return and check asts */ /* diff --git a/osfmk/ppc/cpu.c b/osfmk/ppc/cpu.c index 3a77bccb6..aa6727c90 100644 --- a/osfmk/ppc/cpu.c +++ b/osfmk/ppc/cpu.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2009 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -241,6 +242,9 @@ cpu_per_proc_alloc( proc_info->debstackptr = (vm_offset_t)debugger_stack + KERNEL_STACK_SIZE - FM_SIZE; proc_info->debstack_top_ss = proc_info->debstackptr; + queue_init(&proc_info->rtclock_timer.queue); + proc_info->rtclock_timer.deadline = EndOfAllTime; + return proc_info; } @@ -427,6 +431,11 @@ cpu_sleep( proc_info->running = FALSE; + if (proc_info->cpu_number != master_cpu) { + timer_queue_shutdown(&proc_info->rtclock_timer.queue); + proc_info->rtclock_timer.deadline = EndOfAllTime; + } + fowner = proc_info->FPU_owner; /* Cache this */ if(fowner) /* If anyone owns FPU, save it */ fpu_save(fowner); diff --git a/osfmk/ppc/etimer.c b/osfmk/ppc/etimer.c index 4d3bb1a5f..dca034b91 100644 --- a/osfmk/ppc/etimer.c +++ b/osfmk/ppc/etimer.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -41,6 +41,7 @@ #include #include +#include #include #include #include @@ -53,9 +54,6 @@ #include #include -/* XXX from /rtclock.c */ -clock_timer_func_t rtclock_timer_expire; - /* * Event timer interrupt. * @@ -91,8 +89,7 @@ __unused uint64_t iaddr) /* has a pending clock timer expired? */ if (mytimer->deadline <= abstime) { /* Have we expired the deadline? */ mytimer->has_expired = TRUE; /* Remember that we popped */ - mytimer->deadline = EndOfAllTime; /* Set timer request to the end of all time in case we have no more events */ - (*rtclock_timer_expire)(abstime); /* Process pop */ + mytimer->deadline = timer_queue_expire(&mytimer->queue, abstime); mytimer->has_expired = FALSE; } @@ -102,7 +99,7 @@ __unused uint64_t iaddr) } /* - * Set the clock deadline; called by the thread scheduler. + * Set the clock deadline. */ void etimer_set_deadline(uint64_t deadline) { @@ -165,3 +162,34 @@ etimer_resync_deadlines(void) } splx(s); } + +queue_t +timer_queue_assign( + uint64_t deadline) +{ + struct per_proc_info *pp = getPerProc(); + rtclock_timer_t *timer; + + if (pp->running) { + timer = &pp->rtclock_timer; + + if (deadline < timer->deadline) + etimer_set_deadline(deadline); + } + else + timer = &PerProcTable[master_cpu].ppe_vaddr->rtclock_timer; + + return (&timer->queue); +} + +void +timer_queue_cancel( + queue_t queue, + uint64_t deadline, + uint64_t new_deadline) +{ + if (queue == &getPerProc()->rtclock_timer.queue) { + if (deadline < new_deadline) + etimer_set_deadline(new_deadline); + } +} diff --git a/osfmk/ppc/exception.h b/osfmk/ppc/exception.h index 75f7e2e28..394b884e4 100644 --- a/osfmk/ppc/exception.h +++ b/osfmk/ppc/exception.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Computer, Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -349,7 +349,6 @@ struct per_proc_info { /* PPC cache line boundary here - 140 */ void * pp_cbfr; void * pp_chud; - uint64_t rtclock_intr_deadline; rtclock_timer_t rtclock_timer; unsigned int ppbbTaskEnv; /* BlueBox Task Environment */ diff --git a/osfmk/ppc/machine_routines.c b/osfmk/ppc/machine_routines.c index 9386f8597..ad4add6f0 100644 --- a/osfmk/ppc/machine_routines.c +++ b/osfmk/ppc/machine_routines.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2007 Apple Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 
* * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -815,3 +815,18 @@ void ml_mem_backoff(void) { return; } + + +/* + * Stubs for CPU Stepper + */ +void +machine_run_count(__unused uint32_t count) +{ +} + +boolean_t +machine_cpu_is_inactive(__unused int num) +{ + return(FALSE); +} diff --git a/osfmk/ppc/model_dep.c b/osfmk/ppc/model_dep.c index a6dcb6577..e6dc6435f 100644 --- a/osfmk/ppc/model_dep.c +++ b/osfmk/ppc/model_dep.c @@ -451,7 +451,8 @@ print_backtrace(struct savearea *ssp) while(pbtcnt); /* Wait for completion */ pbt_exit: panic_display_system_configuration(); - + panic_display_zprint(); + dump_kext_info(&kdb_log); return; } diff --git a/osfmk/ppc/ppc_init.c b/osfmk/ppc/ppc_init.c index 35526ab2c..ccdbb8bb9 100644 --- a/osfmk/ppc/ppc_init.c +++ b/osfmk/ppc/ppc_init.c @@ -165,6 +165,8 @@ ppc_init( BootProcInfo.VMX_owner = NULL; BootProcInfo.pp_cbfr = console_per_proc_alloc(TRUE); BootProcInfo.rtcPop = EndOfAllTime; + queue_init(&BootProcInfo.rtclock_timer.queue); + BootProcInfo.rtclock_timer.deadline = EndOfAllTime; BootProcInfo.pp2ndPage = (addr64_t)(uintptr_t)&BootProcInfo; /* Initial physical address of the second page */ BootProcInfo.pms.pmsStamp = 0; /* Dummy transition time */ diff --git a/osfmk/ppc/rtclock.c b/osfmk/ppc/rtclock.c index 90a5754ae..7c1222bd0 100644 --- a/osfmk/ppc/rtclock.c +++ b/osfmk/ppc/rtclock.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -66,9 +66,6 @@ static mach_timebase_info_data_t rtclock_timebase_const; static boolean_t rtclock_timebase_initialized; -/* XXX this should really be in a header somewhere */ -extern clock_timer_func_t rtclock_timer_expire; - decl_simple_lock_data(static,rtclock_lock) /* @@ -214,18 +211,6 @@ clock_timebase_info( UNLOCK_RTC(s); } -void -clock_set_timer_func( - clock_timer_func_t func) -{ - spl_t s; - - LOCK_RTC(s); - if (rtclock_timer_expire == NULL) - rtclock_timer_expire = func; - UNLOCK_RTC(s); -} - void clock_interval_to_absolutetime_interval( uint32_t interval, diff --git a/osfmk/ppc/rtclock.h b/osfmk/ppc/rtclock.h index ed0cbb333..77f287ead 100644 --- a/osfmk/ppc/rtclock.h +++ b/osfmk/ppc/rtclock.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2004-2006 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2004-2008 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -48,6 +48,7 @@ extern void rtclock_intr(struct savearea *ssp); #pragma pack(push,4) struct rtclock_timer_t { + queue_head_t queue; uint64_t deadline; uint32_t /*boolean_t*/ is_set:1, diff --git a/osfmk/vm/memory_object.c b/osfmk/vm/memory_object.c index f83dd5e88..a89aa0ef0 100644 --- a/osfmk/vm/memory_object.c +++ b/osfmk/vm/memory_object.c @@ -636,7 +636,9 @@ vm_object_update_extent( m->list_req_pending = TRUE; m->cleaning = TRUE; - if (should_flush) { + if (should_flush && + /* let's no flush a wired page... 
*/ + !m->wire_count) { /* * and add additional state * for the flush diff --git a/osfmk/vm/vm_apple_protect.c b/osfmk/vm/vm_apple_protect.c index 54e618e40..da167635f 100644 --- a/osfmk/vm/vm_apple_protect.c +++ b/osfmk/vm/vm_apple_protect.c @@ -412,6 +412,7 @@ apple_protect_pager_data_request( pl_count = length / PAGE_SIZE; for (cur_offset = 0; cur_offset < length; cur_offset += PAGE_SIZE) { ppnum_t dst_pnum; + int type_of_fault; if (!upl_page_present(upl_pl, cur_offset / PAGE_SIZE)) { /* this page is not in the UPL: skip it */ @@ -435,7 +436,7 @@ apple_protect_pager_data_request( &prot, &src_page, &top_page, - NULL, + &type_of_fault, &error_code, FALSE, FALSE, diff --git a/osfmk/vm/vm_fault.c b/osfmk/vm/vm_fault.c index 77a34c912..53ba64bee 100644 --- a/osfmk/vm/vm_fault.c +++ b/osfmk/vm/vm_fault.c @@ -2256,6 +2256,7 @@ vm_fault( boolean_t need_collapse = FALSE; int object_lock_type = 0; int cur_object_lock_type; + vm_object_t top_object = VM_OBJECT_NULL; KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START, @@ -2618,16 +2619,26 @@ RetryFault: prot &= ~VM_PROT_WRITE; - /* - * Set up to map the page... - * mark the page busy, drop - * unneeded object lock - */ if (object != cur_object) { - /* - * don't need the original object anymore + /* + * We still need to hold the top object + * lock here to prevent a race between + * a read fault (taking only "shared" + * locks) and a write fault (taking + * an "exclusive" lock on the top + * object. + * Otherwise, as soon as we release the + * top lock, the write fault could + * proceed and actually complete before + * the read fault, and the copied page's + * translation could then be overwritten + * by the read fault's translation for + * the original page. + * + * Let's just record what the top object + * is and we'll release it later. */ - vm_object_unlock(object); + top_object = object; /* * switch to the object that has the new page @@ -2668,6 +2679,20 @@ FastPmapEnter: &type_of_fault); } + if (top_object != VM_OBJECT_NULL) { + /* + * It's safe to drop the top object + * now that we've done our + * vm_fault_enter(). Any other fault + * in progress for that virtual + * address will either find our page + * and translation or put in a new page + * and translation. + */ + vm_object_unlock(top_object); + top_object = VM_OBJECT_NULL; + } + if (need_collapse == TRUE) vm_object_collapse(object, offset, TRUE); diff --git a/osfmk/vm/vm_init.c b/osfmk/vm/vm_init.c index 7b6b17dc6..f5e05931e 100644 --- a/osfmk/vm/vm_init.c +++ b/osfmk/vm/vm_init.c @@ -87,6 +87,7 @@ const vm_offset_t vm_min_kernel_address = VM_MIN_KERNEL_ADDRESS; const vm_offset_t vm_max_kernel_address = VM_MAX_KERNEL_ADDRESS; boolean_t vm_kernel_ready = FALSE; +boolean_t zlog_ready = FALSE; /* * vm_mem_bootstrap initializes the virtual memory system. 
@@ -131,6 +132,8 @@ vm_mem_bootstrap(void) vm_mem_bootstrap_kprintf(("vm_mem_bootstrap: calling pmap_init\n")); pmap_init(); + zlog_ready = TRUE; + if (PE_parse_boot_argn("zsize", &zsizearg, sizeof (zsizearg))) zsize = zsizearg * 1024ULL * 1024ULL; else { diff --git a/pexpert/i386/pe_misc.s b/pexpert/i386/pe_misc.s index 5bad35b4b..a5b224b70 100644 --- a/pexpert/i386/pe_misc.s +++ b/pexpert/i386/pe_misc.s @@ -39,6 +39,7 @@ ENTRY(PE_get_timebase) movl S_ARG0, %ecx + lfence rdtsc lfence diff --git a/pexpert/pexpert/pexpert.h b/pexpert/pexpert/pexpert.h index d8a013397..42e96d977 100644 --- a/pexpert/pexpert/pexpert.h +++ b/pexpert/pexpert/pexpert.h @@ -60,6 +60,7 @@ void PE_init_platform( + void PE_init_kprintf( boolean_t vm_initialized); diff --git a/security/mac_framework.h b/security/mac_framework.h index 58f3e2b33..c3ea61435 100644 --- a/security/mac_framework.h +++ b/security/mac_framework.h @@ -159,7 +159,7 @@ int mac_cred_label_externalize_audit(proc_t p, struct mac *mac); void mac_cred_label_free(struct label *label); void mac_cred_label_init(kauth_cred_t cred); void mac_cred_label_update(kauth_cred_t cred, struct label *newlabel); -void mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t newcred, +int mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t newcred, struct vnode *vp, struct label *scriptvnodelabel, struct label *execlabel); void mac_devfs_label_associate_device(dev_t dev, struct devnode *de, diff --git a/security/mac_policy.h b/security/mac_policy.h index 21b645a73..544565552 100644 --- a/security/mac_policy.h +++ b/security/mac_policy.h @@ -528,6 +528,10 @@ typedef int mpo_cred_label_internalize_t( The final label, execlabel, corresponds to a label supplied by a user space application through the use of the mac_execve system call. + If non-NULL, the value pointed to by disjointp will be set to 0 to + indicate that the old and new credentials are not disjoint, or 1 to + indicate that they are. + The vnode lock is held during this operation. No changes should be made to the old credential structure. */ @@ -537,7 +541,8 @@ typedef void mpo_cred_label_update_execve_t( struct vnode *vp, struct label *vnodelabel, struct label *scriptvnodelabel, - struct label *execlabel + struct label *execlabel, + int *disjointp ); /** @brief Update a credential label diff --git a/security/mac_vfs.c b/security/mac_vfs.c index 2bbfb04db..8910d6d72 100644 --- a/security/mac_vfs.c +++ b/security/mac_vfs.c @@ -413,21 +413,24 @@ mac_vnode_label_store(vfs_context_t ctx, struct vnode *vp, return (error); } -void +int mac_cred_label_update_execve(vfs_context_t ctx, kauth_cred_t new, struct vnode *vp, struct label *scriptvnodelabel, struct label *execl) { kauth_cred_t cred; + int disjoint = 0; if (!mac_proc_enforce && !mac_vnode_enforce) - return; + return disjoint; /* mark the new cred to indicate "matching" includes the label */ new->cr_flags |= CRF_MAC_ENFORCE; cred = vfs_context_ucred(ctx); MAC_PERFORM(cred_label_update_execve, cred, new, vp, vp->v_label, - scriptvnodelabel, execl); + scriptvnodelabel, execl, &disjoint); + + return (disjoint); } int -- 2.45.2