kern/kern_support.c

   1 /*
   2  * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28 /* Copyright (c) 1995-2005 Apple Computer, Inc. All Rights Reserved */
  29 /*
  30  *      pthread_synch.c
  31  */
  32
  33 #pragma mark - Front Matter
  34
  35 #define _PTHREAD_CONDATTR_T
  36 #define _PTHREAD_COND_T
  37 #define _PTHREAD_MUTEXATTR_T
  38 #define _PTHREAD_MUTEX_T
  39 #define _PTHREAD_RWLOCKATTR_T
  40 #define _PTHREAD_RWLOCK_T
  41
  42 #undef pthread_mutexattr_t
  43 #undef pthread_mutex_t
  44 #undef pthread_condattr_t
  45 #undef pthread_cond_t
  46 #undef pthread_rwlockattr_t
  47 #undef pthread_rwlock_t
  48
  49 #include <sys/cdefs.h>
  50 #include <os/log.h>
  51
  52 // <rdar://problem/26158937> panic() should be marked noreturn
  53 extern void panic(const char *string, ...) __printflike(1,2) __dead2;
  54
  55 #include <sys/param.h>
  56 #include <sys/queue.h>
  57 #include <sys/resourcevar.h>
  58 //#include <sys/proc_internal.h>
  59 #include <sys/kauth.h>
  60 #include <sys/systm.h>
  61 #include <sys/timeb.h>
  62 #include <sys/times.h>
  63 #include <sys/acct.h>
  64 #include <sys/kernel.h>
  65 #include <sys/wait.h>
  66 #include <sys/signalvar.h>
  67 #include <sys/sysctl.h>
  68 #include <sys/syslog.h>
  69 #include <sys/stat.h>
  70 #include <sys/lock.h>
  71 #include <sys/kdebug.h>
  72 //#include <sys/sysproto.h>
  73 #include <sys/vm.h>
  74 #include <sys/user.h>           /* for coredump */
  75 #include <sys/proc_info.h>      /* for fill_procworkqueue */
  76
  77 #include <mach/mach_port.h>
  78 #include <mach/mach_types.h>
  79 #include <mach/semaphore.h>
  80 #include <mach/sync_policy.h>
  81 #include <mach/task.h>
  82 #include <mach/vm_prot.h>
  83 #include <kern/kern_types.h>
  84 #include <kern/task.h>
  85 #include <kern/clock.h>
  86 #include <mach/kern_return.h>
  87 #include <kern/thread.h>
  88 #include <kern/zalloc.h>
  89 #include <kern/sched_prim.h>    /* for thread_exception_return */
  90 #include <kern/processor.h>
  91 #include <kern/assert.h>
  92 #include <mach/mach_vm.h>
  93 #include <mach/mach_param.h>
  94 #include <mach/thread_status.h>
  95 #include <mach/thread_policy.h>
  96 #include <mach/message.h>
  97 #include <mach/port.h>
  98 //#include <vm/vm_protos.h>
  99 #include <vm/vm_fault.h>
 100 #include <vm/vm_map.h>
 101 #include <mach/thread_act.h> /* for thread_resume */
 102 #include <machine/machine_routines.h>
 103 #include <mach/shared_region.h>
 104
 105 #include <libkern/OSAtomic.h>
 106 #include <libkern/libkern.h>
 107
 108 #include "kern_internal.h"
 109
/* Defined here only when no earlier header has already provided it. */
#ifndef WQ_SETUP_EXIT_THREAD
#define WQ_SETUP_EXIT_THREAD    8
#endif

// XXX: Ditto for thread tags from kern/thread.h
/* Tested by _bsdthread_terminate() to pick the stack teardown strategy. */
#define THREAD_TAG_MAINTHREAD 0x1
/* Applied by _bsdthread_create() to every thread it creates. */
#define THREAD_TAG_PTHREAD 0x10
#define THREAD_TAG_WORKQUEUE 0x20

/* NOTE(review): presumably the lock group/attributes for this module's locks — confirm against the init code. */
lck_grp_attr_t   *pthread_lck_grp_attr;
lck_grp_t    *pthread_lck_grp;
lck_attr_t   *pthread_lck_attr;

/* Subtracted from the user stack address to form the initial 32-bit %esp. */
#define C_32_STK_ALIGN          16
#define C_64_STK_ALIGN          16
/* Subtracted from the user stack address to form the initial 64-bit %rsp. */
#define C_64_REDZONE_LEN        128

// WORKQ use the largest alignment any platform needs
#define C_WORKQ_STK_ALIGN       16

/* Added to the pthread size and to the stack-top address in the workqueue stack layout math. */
#define PTHREAD_T_OFFSET 0
 131
/*
 * Flags field passed to bsdthread_create and back in pthread_start
31  <---------------------------------> 0
_________________________________________
| flags(8) | policy(8) | importance(16) |
-----------------------------------------
*/
 139
/* Required: _bsdthread_create() fails with EINVAL when this bit is absent. */
#define PTHREAD_START_CUSTOM            0x01000000 // <rdar://problem/34501401>
/* Apply the policy/importance fields of the flags word to the new thread. */
#define PTHREAD_START_SETSCHED          0x02000000
// was PTHREAD_START_DETACHED           0x04000000
/* Apply the QoS encoded in the low PTHREAD_START_QOSCLASS_MASK bits (only when SETSCHED is not set). */
#define PTHREAD_START_QOSCLASS          0x08000000
/* Set by the kernel when thread_set_tsd_base() succeeded for the new thread. */
#define PTHREAD_START_TSD_BASE_SET      0x10000000
/* Do not resume the new thread; stripped from the flags handed to the thread start routine. */
#define PTHREAD_START_SUSPENDED         0x20000000
#define PTHREAD_START_QOSCLASS_MASK     0x00ffffff
/* policy = (flags >> BITSHIFT) & POLICY_MASK; importance = flags & IMPORTANCE_MASK */
#define PTHREAD_START_POLICY_BITSHIFT 16
#define PTHREAD_START_POLICY_MASK 0xff
#define PTHREAD_START_IMPORTANCE_MASK 0xffff

/* POSIX scheduling policy names mapped onto the Mach policy constants. */
#define SCHED_OTHER      POLICY_TIMESHARE
#define SCHED_FIFO       POLICY_FIFO
#define SCHED_RR         POLICY_RR

/* Subtracted from the requested importance to form the precedence policy value. */
#define BASEPRI_DEFAULT 31

/* NOTE(review): presumably gates the PTHREAD_TRACE tracepoints — confirm against the macro definition. */
uint32_t pthread_debug_tracing = 1;

/* Reported to userspace in the registration reply by _bsdthread_register(). */
static uint32_t pthread_mutex_default_policy;

/* Registers the value above as a read-write sysctl named pthread_mutex_default_policy under _kern. */
SYSCTL_INT(_kern, OID_AUTO, pthread_mutex_default_policy, CTLFLAG_RW | CTLFLAG_LOCKED,
           &pthread_mutex_default_policy, 0, "");
 163
 164 #pragma mark - Process/Thread Setup/Teardown syscalls
 165
 166 static mach_vm_offset_t
 167 stack_addr_hint(proc_t p, vm_map_t vmap)
 168 {
 169         mach_vm_offset_t stackaddr;
 170         mach_vm_offset_t aslr_offset;
 171         bool proc64bit = proc_is64bit(p);
 172
 173         // We can't safely take random values % something unless its a power-of-two
 174         _Static_assert(powerof2(PTH_DEFAULT_STACKSIZE), "PTH_DEFAULT_STACKSIZE is a power-of-two");
 175
 176 #if defined(__i386__) || defined(__x86_64__)
 177         if (proc64bit) {
 178                 // Matches vm_map_get_max_aslr_slide_pages's image shift in xnu
 179                 aslr_offset = random() % (1 << 28); // about 512 stacks
 180         } else {
 181                 // Actually bigger than the image shift, we've got ~256MB to work with
 182                 aslr_offset = random() % (16 * PTH_DEFAULT_STACKSIZE);
 183         }
 184         aslr_offset = vm_map_trunc_page_mask(aslr_offset, vm_map_page_mask(vmap));
 185         if (proc64bit) {
 186                 // Above nanomalloc range (see NANOZONE_SIGNATURE)
 187                 stackaddr = 0x700000000000 + aslr_offset;
 188         } else {
 189                 stackaddr = SHARED_REGION_BASE_I386 + SHARED_REGION_SIZE_I386 + aslr_offset;
 190         }
 191 #elif defined(__arm__) || defined(__arm64__)
 192         user_addr_t main_thread_stack_top = 0;
 193         if (pthread_kern->proc_get_user_stack) {
 194                 main_thread_stack_top = pthread_kern->proc_get_user_stack(p);
 195         }
 196         if (proc64bit && main_thread_stack_top) {
 197                 // The main thread stack position is randomly slid by xnu (c.f.
 198                 // load_main() in mach_loader.c), so basing pthread stack allocations
 199                 // where the main thread stack ends is already ASLRd and doing so
 200                 // avoids creating a gap in the process address space that may cause
 201                 // extra PTE memory usage. rdar://problem/33328206
 202                 stackaddr = vm_map_trunc_page_mask((vm_map_offset_t)main_thread_stack_top,
 203                                 vm_map_page_mask(vmap));
 204         } else {
 205                 // vm_map_get_max_aslr_slide_pages ensures 1MB of slide, we do better
 206                 aslr_offset = random() % ((proc64bit ? 4 : 2) * PTH_DEFAULT_STACKSIZE);
 207                 aslr_offset = vm_map_trunc_page_mask((vm_map_offset_t)aslr_offset,
 208                                 vm_map_page_mask(vmap));
 209                 if (proc64bit) {
 210                         // 64 stacks below shared region
 211                         stackaddr = SHARED_REGION_BASE_ARM64 - 64 * PTH_DEFAULT_STACKSIZE - aslr_offset;
 212                 } else {
 213                         // If you try to slide down from this point, you risk ending up in memory consumed by malloc
 214                         stackaddr = SHARED_REGION_BASE_ARM - 32 * PTH_DEFAULT_STACKSIZE + aslr_offset;
 215                 }
 216         }
 217 #else
 218 #error Need to define a stack address hint for this architecture
 219 #endif
 220         return stackaddr;
 221 }
 222
 223 static bool
 224 _pthread_priority_to_policy(pthread_priority_t priority,
 225                 thread_qos_policy_data_t *data)
 226 {
 227         data->qos_tier = _pthread_priority_thread_qos(priority);
 228         data->tier_importance = _pthread_priority_relpri(priority);
 229         if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
 230                         data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
 231                 return false;
 232         }
 233         return true;
 234 }
 235
 236 /**
 237  * bsdthread_create system call.  Used by pthread_create.
 238  */
 239 int
 240 _bsdthread_create(struct proc *p,
 241                 __unused user_addr_t user_func, __unused user_addr_t user_funcarg,
 242                 user_addr_t user_stack, user_addr_t user_pthread, uint32_t flags,
 243                 user_addr_t *retval)
 244 {
 245         kern_return_t kret;
 246         void * sright;
 247         int error = 0;
 248         mach_vm_offset_t th_tsd_base;
 249         mach_port_name_t th_thport;
 250         thread_t th;
 251         task_t ctask = current_task();
 252         unsigned int policy, importance;
 253         uint32_t tsd_offset;
 254         bool start_suspended = (flags & PTHREAD_START_SUSPENDED);
 255
 256         if (pthread_kern->proc_get_register(p) == 0) {
 257                 return EINVAL;
 258         }
 259
 260         PTHREAD_TRACE(pthread_thread_create | DBG_FUNC_START, flags, 0, 0, 0);
 261
 262         kret = pthread_kern->thread_create(ctask, &th);
 263         if (kret != KERN_SUCCESS)
 264                 return(ENOMEM);
 265         thread_reference(th);
 266
 267         pthread_kern->thread_set_tag(th, THREAD_TAG_PTHREAD);
 268
 269         sright = (void *)pthread_kern->convert_thread_to_port(th);
 270         th_thport = pthread_kern->ipc_port_copyout_send(sright, pthread_kern->task_get_ipcspace(ctask));
 271         if (!MACH_PORT_VALID(th_thport)) {
 272                 error = EMFILE; // userland will convert this into a crash
 273                 goto out;
 274         }
 275
 276         if ((flags & PTHREAD_START_CUSTOM) == 0) {
 277                 error = EINVAL;
 278                 goto out;
 279         }
 280
 281         PTHREAD_TRACE(pthread_thread_create|DBG_FUNC_NONE, 0, 0, 0, 3);
 282
 283         tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
 284         if (tsd_offset) {
 285                 th_tsd_base = user_pthread + tsd_offset;
 286                 kret = pthread_kern->thread_set_tsd_base(th, th_tsd_base);
 287                 if (kret == KERN_SUCCESS) {
 288                         flags |= PTHREAD_START_TSD_BASE_SET;
 289                 }
 290         }
 291         /*
 292          * Strip PTHREAD_START_SUSPENDED so that libpthread can observe the kernel
 293          * supports this flag (after the fact).
 294          */
 295         flags &= ~PTHREAD_START_SUSPENDED;
 296
 297         /*
 298          * Set up registers & function call.
 299          */
 300 #if defined(__i386__) || defined(__x86_64__)
 301         if (proc_is64bit_data(p)) {
 302                 x86_thread_state64_t state = {
 303                         .rip = (uint64_t)pthread_kern->proc_get_threadstart(p),
 304                         .rdi = (uint64_t)user_pthread,
 305                         .rsi = (uint64_t)th_thport,
 306                         .rdx = (uint64_t)user_func,    /* golang wants this */
 307                         .rcx = (uint64_t)user_funcarg, /* golang wants this */
 308                         .r8  = (uint64_t)user_stack,   /* golang wants this */
 309                         .r9  = (uint64_t)flags,
 310
 311                         .rsp = (uint64_t)(user_stack - C_64_REDZONE_LEN)
 312                 };
 313
 314                 (void)pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state);
 315         } else {
 316                 x86_thread_state32_t state = {
 317                         .eip = (uint32_t)pthread_kern->proc_get_threadstart(p),
 318                         .eax = (uint32_t)user_pthread,
 319                         .ebx = (uint32_t)th_thport,
 320                         .ecx = (uint32_t)user_func,    /* golang wants this */
 321                         .edx = (uint32_t)user_funcarg, /* golang wants this */
 322                         .edi = (uint32_t)user_stack,   /* golang wants this */
 323                         .esi = (uint32_t)flags,
 324
 325                         .esp = (int)((vm_offset_t)(user_stack - C_32_STK_ALIGN))
 326                 };
 327
 328                 (void)pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
 329         }
 330 #else
 331 #error bsdthread_create  not defined for this architecture
 332 #endif
 333
 334         if (flags & PTHREAD_START_SETSCHED) {
 335                 /* Set scheduling parameters if needed */
 336                 thread_extended_policy_data_t    extinfo;
 337                 thread_precedence_policy_data_t   precedinfo;
 338
 339                 importance = (flags & PTHREAD_START_IMPORTANCE_MASK);
 340                 policy = (flags >> PTHREAD_START_POLICY_BITSHIFT) & PTHREAD_START_POLICY_MASK;
 341
 342                 if (policy == SCHED_OTHER) {
 343                         extinfo.timeshare = 1;
 344                 } else {
 345                         extinfo.timeshare = 0;
 346                 }
 347
 348                 thread_policy_set(th, THREAD_EXTENDED_POLICY, (thread_policy_t)&extinfo, THREAD_EXTENDED_POLICY_COUNT);
 349
 350                 precedinfo.importance = (importance - BASEPRI_DEFAULT);
 351                 thread_policy_set(th, THREAD_PRECEDENCE_POLICY, (thread_policy_t)&precedinfo, THREAD_PRECEDENCE_POLICY_COUNT);
 352         } else if (flags & PTHREAD_START_QOSCLASS) {
 353                 /* Set thread QoS class if requested. */
 354                 thread_qos_policy_data_t qos;
 355
 356                 if (!_pthread_priority_to_policy(flags & PTHREAD_START_QOSCLASS_MASK, &qos)) {
 357                         error = EINVAL;
 358                         goto out;
 359                 }
 360                 pthread_kern->thread_policy_set_internal(th, THREAD_QOS_POLICY,
 361                                 (thread_policy_t)&qos, THREAD_QOS_POLICY_COUNT);
 362         }
 363
 364         if (pthread_kern->proc_get_mach_thread_self_tsd_offset) {
 365                 uint64_t mach_thread_self_offset =
 366                                 pthread_kern->proc_get_mach_thread_self_tsd_offset(p);
 367                 if (mach_thread_self_offset && tsd_offset) {
 368                         bool proc64bit = proc_is64bit(p);
 369                         if (proc64bit) {
 370                                 uint64_t th_thport_tsd = (uint64_t)th_thport;
 371                                 error = copyout(&th_thport_tsd, user_pthread + tsd_offset +
 372                                                 mach_thread_self_offset, sizeof(th_thport_tsd));
 373                         } else {
 374                                 uint32_t th_thport_tsd = (uint32_t)th_thport;
 375                                 error = copyout(&th_thport_tsd, user_pthread + tsd_offset +
 376                                                 mach_thread_self_offset, sizeof(th_thport_tsd));
 377                         }
 378                         if (error) {
 379                                 goto out;
 380                         }
 381                 }
 382         }
 383
 384         if (!start_suspended) {
 385                 kret = pthread_kern->thread_resume(th);
 386                 if (kret != KERN_SUCCESS) {
 387                         error = EINVAL;
 388                         goto out;
 389                 }
 390         }
 391         thread_deallocate(th);  /* drop the creator reference */
 392
 393         PTHREAD_TRACE(pthread_thread_create|DBG_FUNC_END, error, user_pthread, 0, 0);
 394
 395         *retval = user_pthread;
 396         return(0);
 397
 398 out:
 399         (void)pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(ctask), th_thport);
 400         if (pthread_kern->thread_will_park_or_terminate) {
 401                 pthread_kern->thread_will_park_or_terminate(th);
 402         }
 403         (void)thread_terminate(th);
 404         (void)thread_deallocate(th);
 405         return(error);
 406 }
 407
 408 /**
 409  * bsdthread_terminate system call.  Used by pthread_terminate
 410  */
 411 int
 412 _bsdthread_terminate(__unused struct proc *p,
 413                      user_addr_t stackaddr,
 414                      size_t size,
 415                      uint32_t kthport,
 416                      uint32_t sem,
 417                      __unused int32_t *retval)
 418 {
 419         mach_vm_offset_t freeaddr;
 420         mach_vm_size_t freesize;
 421         kern_return_t kret;
 422         thread_t th = current_thread();
 423
 424         freeaddr = (mach_vm_offset_t)stackaddr;
 425         freesize = size;
 426
 427         PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_START, freeaddr, freesize, kthport, 0xff);
 428
 429         if ((freesize != (mach_vm_size_t)0) && (freeaddr != (mach_vm_offset_t)0)) {
 430                 if (pthread_kern->thread_get_tag(th) & THREAD_TAG_MAINTHREAD){
 431                         vm_map_t user_map = pthread_kern->current_map();
 432                         freesize = vm_map_trunc_page_mask((vm_map_offset_t)freesize - 1, vm_map_page_mask(user_map));
 433                         kret = mach_vm_behavior_set(user_map, freeaddr, freesize, VM_BEHAVIOR_REUSABLE);
 434 #if MACH_ASSERT
 435                         if (kret != KERN_SUCCESS && kret != KERN_INVALID_ADDRESS) {
 436                                 os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kret);
 437                         }
 438 #endif
 439                         kret = kret ? kret : mach_vm_protect(user_map, freeaddr, freesize, FALSE, VM_PROT_NONE);
 440                         assert(kret == KERN_SUCCESS || kret == KERN_INVALID_ADDRESS);
 441                 } else {
 442                         kret = mach_vm_deallocate(pthread_kern->current_map(), freeaddr, freesize);
 443                         if (kret != KERN_SUCCESS) {
 444                                 PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0);
 445                         }
 446                 }
 447         }
 448
 449         if (pthread_kern->thread_will_park_or_terminate) {
 450                 pthread_kern->thread_will_park_or_terminate(th);
 451         }
 452         (void)thread_terminate(th);
 453         if (sem != MACH_PORT_NULL) {
 454                 kret = pthread_kern->semaphore_signal_internal_trap(sem);
 455                 if (kret != KERN_SUCCESS) {
 456                         PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_END, kret, 0, 0, 0);
 457                 }
 458         }
 459
 460         if (kthport != MACH_PORT_NULL) {
 461                 pthread_kern->mach_port_deallocate(pthread_kern->task_get_ipcspace(current_task()), kthport);
 462         }
 463
 464         PTHREAD_TRACE(pthread_thread_terminate|DBG_FUNC_END, 0, 0, 0, 0);
 465
 466         pthread_kern->thread_exception_return();
 467         __builtin_unreachable();
 468 }
 469
 470 /**
 471  * bsdthread_register system call.  Performs per-process setup.  Responsible for
 472  * returning capabilitiy bits to userspace and receiving userspace function addresses.
 473  */
 474 int
 475 _bsdthread_register(struct proc *p,
 476                     user_addr_t threadstart,
 477                     user_addr_t wqthread,
 478                     int pthsize,
 479                     user_addr_t pthread_init_data,
 480                     user_addr_t pthread_init_data_size,
 481                     uint64_t dispatchqueue_offset,
 482                     int32_t *retval)
 483 {
 484         struct _pthread_registration_data data = {};
 485         uint32_t max_tsd_offset;
 486         kern_return_t kr;
 487         size_t pthread_init_sz = 0;
 488
 489         /* syscall randomizer test can pass bogus values */
 490         if (pthsize < 0 || pthsize > MAX_PTHREAD_SIZE) {
 491                 return(EINVAL);
 492         }
 493         /*
 494          * if we have pthread_init_data, then we use that and target_concptr
 495          * (which is an offset) get data.
 496          */
 497         if (pthread_init_data != 0) {
 498                 if (pthread_init_data_size < sizeof(data.version)) {
 499                         return EINVAL;
 500                 }
 501                 pthread_init_sz = MIN(sizeof(data), (size_t)pthread_init_data_size);
 502                 int ret = copyin(pthread_init_data, &data, pthread_init_sz);
 503                 if (ret) {
 504                         return ret;
 505                 }
 506                 if (data.version != (size_t)pthread_init_data_size) {
 507                         return EINVAL;
 508                 }
 509         } else {
 510                 data.dispatch_queue_offset = dispatchqueue_offset;
 511         }
 512
 513         /* We have to do this before proc_get_register so that it resets after fork */
 514         mach_vm_offset_t stackaddr = stack_addr_hint(p, pthread_kern->current_map());
 515         pthread_kern->proc_set_stack_addr_hint(p, (user_addr_t)stackaddr);
 516
 517         /* prevent multiple registrations */
 518         if (pthread_kern->proc_get_register(p) != 0) {
 519                 return(EINVAL);
 520         }
 521
 522         pthread_kern->proc_set_threadstart(p, threadstart);
 523         pthread_kern->proc_set_wqthread(p, wqthread);
 524         pthread_kern->proc_set_pthsize(p, pthsize);
 525         pthread_kern->proc_set_register(p);
 526
 527         uint32_t tsd_slot_sz = proc_is64bit(p) ? sizeof(uint64_t) : sizeof(uint32_t);
 528         if ((uint32_t)pthsize >= tsd_slot_sz &&
 529                         data.tsd_offset <= (uint32_t)(pthsize - tsd_slot_sz)) {
 530                 max_tsd_offset = ((uint32_t)pthsize - data.tsd_offset - tsd_slot_sz);
 531         } else {
 532                 data.tsd_offset = 0;
 533                 max_tsd_offset = 0;
 534         }
 535         pthread_kern->proc_set_pthread_tsd_offset(p, data.tsd_offset);
 536
 537         if (data.dispatch_queue_offset > max_tsd_offset) {
 538                 data.dispatch_queue_offset = 0;
 539         }
 540         pthread_kern->proc_set_dispatchqueue_offset(p, data.dispatch_queue_offset);
 541
 542         if (pthread_kern->proc_set_return_to_kernel_offset) {
 543                 if (data.return_to_kernel_offset > max_tsd_offset) {
 544                         data.return_to_kernel_offset = 0;
 545                 }
 546                 pthread_kern->proc_set_return_to_kernel_offset(p,
 547                                 data.return_to_kernel_offset);
 548         }
 549
 550         if (pthread_kern->proc_set_mach_thread_self_tsd_offset) {
 551                 if (data.mach_thread_self_offset > max_tsd_offset) {
 552                         data.mach_thread_self_offset = 0;
 553                 }
 554                 pthread_kern->proc_set_mach_thread_self_tsd_offset(p,
 555                                 data.mach_thread_self_offset);
 556         }
 557
 558         if (pthread_init_data != 0) {
 559                 /* Outgoing data that userspace expects as a reply */
 560                 data.version = sizeof(struct _pthread_registration_data);
 561                 data.main_qos = _pthread_unspecified_priority();
 562
 563                 if (pthread_kern->qos_main_thread_active()) {
 564                         mach_msg_type_number_t nqos = THREAD_QOS_POLICY_COUNT;
 565                         thread_qos_policy_data_t qos;
 566                         boolean_t gd = FALSE;
 567
 568                         kr = pthread_kern->thread_policy_get(current_thread(),
 569                                         THREAD_QOS_POLICY, (thread_policy_t)&qos, &nqos, &gd);
 570                         if (kr != KERN_SUCCESS || qos.qos_tier == THREAD_QOS_UNSPECIFIED) {
 571                                 /*
 572                                  * Unspecified threads means the kernel wants us
 573                                  * to impose legacy upon the thread.
 574                                  */
 575                                 qos.qos_tier = THREAD_QOS_LEGACY;
 576                                 qos.tier_importance = 0;
 577
 578                                 kr = pthread_kern->thread_policy_set_internal(current_thread(),
 579                                                 THREAD_QOS_POLICY, (thread_policy_t)&qos,
 580                                                 THREAD_QOS_POLICY_COUNT);
 581                         }
 582
 583                         if (kr == KERN_SUCCESS) {
 584                                 data.main_qos = _pthread_priority_make_from_thread_qos(
 585                                                 qos.qos_tier, 0, 0);
 586                         }
 587                 }
 588
 589                 data.stack_addr_hint = stackaddr;
 590                 data.mutex_default_policy = pthread_mutex_default_policy;
 591
 592                 kr = copyout(&data, pthread_init_data, pthread_init_sz);
 593                 if (kr != KERN_SUCCESS) {
 594                         return EINVAL;
 595                 }
 596         }
 597
 598         /* return the supported feature set as the return value. */
 599         *retval = PTHREAD_FEATURE_SUPPORTED;
 600
 601         return(0);
 602 }
 603
 604
 605 #pragma mark - Workqueue Thread Support
 606
 607 static mach_vm_size_t
 608 workq_thread_allocsize(proc_t p, vm_map_t wq_map,
 609                 mach_vm_size_t *guardsize_out)
 610 {
 611         mach_vm_size_t guardsize = vm_map_page_size(wq_map);
 612         mach_vm_size_t pthread_size = vm_map_round_page_mask(
 613                         pthread_kern->proc_get_pthsize(p) + PTHREAD_T_OFFSET,
 614                         vm_map_page_mask(wq_map));
 615         if (guardsize_out) *guardsize_out = guardsize;
 616         return guardsize + PTH_DEFAULT_STACKSIZE + pthread_size;
 617 }
 618
 619 int
 620 workq_create_threadstack(proc_t p, vm_map_t vmap, mach_vm_offset_t *out_addr)
 621 {
 622         mach_vm_offset_t stackaddr = pthread_kern->proc_get_stack_addr_hint(p);
 623         mach_vm_size_t guardsize, th_allocsize;
 624         kern_return_t kret;
 625
 626         th_allocsize = workq_thread_allocsize(p, vmap, &guardsize);
 627         kret = mach_vm_map(vmap, &stackaddr, th_allocsize, page_size - 1,
 628                         VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE, NULL, 0, FALSE,
 629                         VM_PROT_DEFAULT, VM_PROT_ALL, VM_INHERIT_DEFAULT);
 630
 631         if (kret != KERN_SUCCESS) {
 632                 kret = mach_vm_allocate(vmap, &stackaddr, th_allocsize,
 633                                 VM_MAKE_TAG(VM_MEMORY_STACK) | VM_FLAGS_ANYWHERE);
 634         }
 635
 636         if (kret != KERN_SUCCESS) {
 637                 goto fail;
 638         }
 639
 640         /*
 641          * The guard page is at the lowest address
 642          * The stack base is the highest address
 643          */
 644         kret = mach_vm_protect(vmap, stackaddr, guardsize, FALSE, VM_PROT_NONE);
 645         if (kret != KERN_SUCCESS) {
 646                 goto fail_vm_deallocate;
 647         }
 648
 649         if (out_addr) {
 650                 *out_addr = stackaddr;
 651         }
 652         return 0;
 653
 654 fail_vm_deallocate:
 655         (void)mach_vm_deallocate(vmap, stackaddr, th_allocsize);
 656 fail:
 657         return kret;
 658 }
 659
 660 int
 661 workq_destroy_threadstack(proc_t p, vm_map_t vmap, mach_vm_offset_t stackaddr)
 662 {
 663         return mach_vm_deallocate(vmap, stackaddr,
 664                         workq_thread_allocsize(p, vmap, NULL));
 665 }
 666
 667 void
 668 workq_markfree_threadstack(proc_t OS_UNUSED p, thread_t OS_UNUSED th,
 669                 vm_map_t vmap, user_addr_t stackaddr)
 670 {
 671         // Keep this in sync with workq_setup_thread()
 672         const vm_size_t       guardsize = vm_map_page_size(vmap);
 673         const user_addr_t     freeaddr = (user_addr_t)stackaddr + guardsize;
 674         const vm_map_offset_t freesize = vm_map_trunc_page_mask(
 675                         (PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET) - 1,
 676                         vm_map_page_mask(vmap)) - guardsize;
 677
 678         __assert_only kern_return_t kr = mach_vm_behavior_set(vmap, freeaddr,
 679                         freesize, VM_BEHAVIOR_REUSABLE);
 680 #if MACH_ASSERT
 681         if (kr != KERN_SUCCESS && kr != KERN_INVALID_ADDRESS) {
 682                 os_log_error(OS_LOG_DEFAULT, "unable to make thread stack reusable (kr: %d)", kr);
 683         }
 684 #endif
 685 }
 686
/*
 * User-space addresses derived from a workqueue thread's region base by
 * workq_thread_get_addrs().
 */
struct workq_thread_addrs {
        /* base + default stack size + guard size + PTHREAD_T_OFFSET */
        user_addr_t self;
        /* base + guard size, i.e. the first address above the guard page */
        user_addr_t stack_bottom;
        /* initial stack pointer: an address rounded down to C_WORKQ_STK_ALIGN */
        user_addr_t stack_top;
};
 692
 693 static inline void
 694 workq_thread_set_top_addr(struct workq_thread_addrs *th_addrs, user_addr_t addr)
 695 {
 696         th_addrs->stack_top = (addr & -C_WORKQ_STK_ALIGN);
 697 }
 698
 699 static void
 700 workq_thread_get_addrs(vm_map_t map, user_addr_t stackaddr,
 701                                            struct workq_thread_addrs *th_addrs)
 702 {
 703         const vm_size_t guardsize = vm_map_page_size(map);
 704
 705         th_addrs->self = (user_addr_t)(stackaddr + PTH_DEFAULT_STACKSIZE +
 706                         guardsize + PTHREAD_T_OFFSET);
 707         workq_thread_set_top_addr(th_addrs, th_addrs->self);
 708         th_addrs->stack_bottom = (user_addr_t)(stackaddr + guardsize);
 709 }
 710
 711 static inline void
 712 workq_set_register_state(proc_t p, thread_t th,
 713                 struct workq_thread_addrs *addrs, mach_port_name_t kport,
 714                 user_addr_t kevent_list, uint32_t upcall_flags, int kevent_count)
 715 {
 716         user_addr_t wqstart_fnptr = pthread_kern->proc_get_wqthread(p);
 717         if (!wqstart_fnptr) {
 718                 panic("workqueue thread start function pointer is NULL");
 719         }
 720
 721 #if defined(__i386__) || defined(__x86_64__)
 722         if (proc_is64bit_data(p) == 0) {
 723                 x86_thread_state32_t state = {
 724                         .eip = (unsigned int)wqstart_fnptr,
 725                         .eax = /* arg0 */ (unsigned int)addrs->self,
 726                         .ebx = /* arg1 */ (unsigned int)kport,
 727                         .ecx = /* arg2 */ (unsigned int)addrs->stack_bottom,
 728                         .edx = /* arg3 */ (unsigned int)kevent_list,
 729                         .edi = /* arg4 */ (unsigned int)upcall_flags,
 730                         .esi = /* arg5 */ (unsigned int)kevent_count,
 731
 732                         .esp = (int)((vm_offset_t)addrs->stack_top),
 733                 };
 734
 735                 int error = pthread_kern->thread_set_wq_state32(th, (thread_state_t)&state);
 736                 if (error != KERN_SUCCESS) {
 737                         panic(__func__ ": thread_set_wq_state failed: %d", error);
 738                 }
 739         } else {
 740                 x86_thread_state64_t state64 = {
 741                         // x86-64 already passes all the arguments in registers, so we just put them in their final place here
 742                         .rip = (uint64_t)wqstart_fnptr,
 743                         .rdi = (uint64_t)addrs->self,
 744                         .rsi = (uint64_t)kport,
 745                         .rdx = (uint64_t)addrs->stack_bottom,
 746                         .rcx = (uint64_t)kevent_list,
 747                         .r8  = (uint64_t)upcall_flags,
 748                         .r9  = (uint64_t)kevent_count,
 749
 750                         .rsp = (uint64_t)(addrs->stack_top)
 751                 };
 752
 753                 int error = pthread_kern->thread_set_wq_state64(th, (thread_state_t)&state64);
 754                 if (error != KERN_SUCCESS) {
 755                         panic(__func__ ": thread_set_wq_state failed: %d", error);
 756                 }
 757         }
 758 #else
 759 #error setup_wqthread  not defined for this architecture
 760 #endif
 761 }
 762
 763 static int
 764 workq_kevent(proc_t p, struct workq_thread_addrs *th_addrs, int upcall_flags,
 765                 user_addr_t eventlist, int nevents, int kevent_flags,
 766                 user_addr_t *kevent_list_out, int *kevent_count_out)
 767 {
 768         bool workloop = upcall_flags & WQ_FLAG_THREAD_WORKLOOP;
 769         int kevent_count = WQ_KEVENT_LIST_LEN;
 770         user_addr_t kevent_list = th_addrs->self - WQ_KEVENT_LIST_LEN * sizeof(struct kevent_qos_s);
 771         user_addr_t kevent_id_addr = kevent_list;
 772         kqueue_id_t kevent_id = -1;
 773         int ret;
 774
 775         if (workloop) {
 776                 /*
 777                  * The kevent ID goes just below the kevent list.  Sufficiently new
 778                  * userspace will know to look there.  Old userspace will just
 779                  * ignore it.
 780                  */
 781                 kevent_id_addr -= sizeof(kqueue_id_t);
 782         }
 783
 784         user_addr_t kevent_data_buf = kevent_id_addr - WQ_KEVENT_DATA_SIZE;
 785         user_size_t kevent_data_available = WQ_KEVENT_DATA_SIZE;
 786
 787         if (workloop) {
 788                 kevent_flags |= KEVENT_FLAG_WORKLOOP;
 789                 ret = kevent_id_internal(p, &kevent_id,
 790                                 eventlist, nevents, kevent_list, kevent_count,
 791                                 kevent_data_buf, &kevent_data_available,
 792                                 kevent_flags, &kevent_count);
 793                 copyout(&kevent_id, kevent_id_addr, sizeof(kevent_id));
 794         } else {
 795                 kevent_flags |= KEVENT_FLAG_WORKQ;
 796                 ret = kevent_qos_internal(p, -1, eventlist, nevents, kevent_list,
 797                                 kevent_count, kevent_data_buf, &kevent_data_available,
 798                                 kevent_flags, &kevent_count);
 799         }
 800
 801         // squash any errors into just empty output
 802         if (ret != 0 || kevent_count == -1) {
 803                 *kevent_list_out = NULL;
 804                 *kevent_count_out = 0;
 805                 return ret;
 806         }
 807
 808         if (kevent_data_available == WQ_KEVENT_DATA_SIZE) {
 809                 workq_thread_set_top_addr(th_addrs, kevent_id_addr);
 810         } else {
 811                 workq_thread_set_top_addr(th_addrs,
 812                                 kevent_data_buf + kevent_data_available);
 813         }
 814         *kevent_count_out = kevent_count;
 815         *kevent_list_out = kevent_list;
 816         return ret;
 817 }
 818
 819 /**
 * Configures the initial thread stack/registers to jump into:
 * _pthread_wqthread(pthread_t self, mach_port_t kport, void *stackaddr, void *keventlist, int upcall_flags, int nkevents);
 * To get there we jump through assembly stubs in pthread_asm.s.  Those
 * routines set up a stack frame, using the current stack pointer, and marshal
 * arguments from registers to the stack as required by the ABI.
 825  *
 826  * One odd thing we do here is to start the pthread_t 4k below what would be the
 827  * top of the stack otherwise.  This is because usually only the first 4k of the
 828  * pthread_t will be used and so we want to put it on the same 16k page as the
 829  * top of the stack to save memory.
 830  *
 831  * When we are done the stack will look like:
 832  * |-----------| th_stackaddr + th_allocsize
 * |pthread_t  | th_stackaddr + PTH_DEFAULT_STACKSIZE + guardsize + PTHREAD_T_OFFSET
 834  * |kevent list| optionally - at most WQ_KEVENT_LIST_LEN events
 835  * |kevent data| optionally - at most WQ_KEVENT_DATA_SIZE bytes
 836  * |stack gap  | bottom aligned to 16 bytes, and at least as big as stack_gap_min
 837  * |   STACK   |
 838  * |     ⇓     |
 839  * |           |
 840  * |guard page | guardsize
 841  * |-----------| th_stackaddr
 842  */
 843 __attribute__((noreturn,noinline))
 844 void
 845 workq_setup_thread(proc_t p, thread_t th, vm_map_t map, user_addr_t stackaddr,
 846                 mach_port_name_t kport, int th_qos __unused, int setup_flags, int upcall_flags)
 847 {
 848         struct workq_thread_addrs th_addrs;
 849         bool first_use = (setup_flags & WQ_SETUP_FIRST_USE);
 850         user_addr_t kevent_list = NULL;
 851         int kevent_count = 0;
 852
 853         workq_thread_get_addrs(map, stackaddr, &th_addrs);
 854
 855         if (first_use) {
 856                 uint32_t tsd_offset = pthread_kern->proc_get_pthread_tsd_offset(p);
 857                 if (tsd_offset) {
 858                         mach_vm_offset_t th_tsd_base = th_addrs.self + tsd_offset;
 859                         kern_return_t kret = pthread_kern->thread_set_tsd_base(th,
 860                                         th_tsd_base);
 861                         if (kret == KERN_SUCCESS) {
 862                                 upcall_flags |= WQ_FLAG_THREAD_TSD_BASE_SET;
 863                         }
 864                 }
 865
 866                 /*
 867                  * Pre-fault the first page of the new thread's stack and the page that will
 868                  * contain the pthread_t structure.
 869                  */
 870                 vm_map_offset_t mask = vm_map_page_mask(map);
 871                 vm_map_offset_t th_page = vm_map_trunc_page_mask(th_addrs.self, mask);
 872                 vm_map_offset_t stk_page = vm_map_trunc_page_mask(th_addrs.stack_top - 1, mask);
 873                 if (th_page != stk_page) {
 874                         vm_fault(map, stk_page, VM_PROT_READ | VM_PROT_WRITE, FALSE, THREAD_UNINT, NULL, 0);
 875                 }
 876                 vm_fault(map, th_page, VM_PROT_READ | VM_PROT_WRITE, FALSE, THREAD_UNINT, NULL, 0);
 877         }
 878
 879         if (setup_flags & WQ_SETUP_EXIT_THREAD) {
 880                 kevent_count = WORKQ_EXIT_THREAD_NKEVENT;
 881         } else if (upcall_flags & WQ_FLAG_THREAD_KEVENT) {
 882                 unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE;
 883                 workq_kevent(p, &th_addrs, upcall_flags, NULL, 0, flags,
 884                                 &kevent_list, &kevent_count);
 885         }
 886
 887         workq_set_register_state(p, th, &th_addrs, kport,
 888                         kevent_list, upcall_flags, kevent_count);
 889
 890         if (first_use) {
 891                 pthread_kern->thread_bootstrap_return();
 892         } else {
 893                 pthread_kern->unix_syscall_return(EJUSTRETURN);
 894         }
 895         __builtin_unreachable();
 896 }
 897
 898 int
 899 workq_handle_stack_events(proc_t p, thread_t th, vm_map_t map,
 900                 user_addr_t stackaddr, mach_port_name_t kport,
 901                 user_addr_t events, int nevents, int upcall_flags)
 902 {
 903         struct workq_thread_addrs th_addrs;
 904         user_addr_t kevent_list = NULL;
 905         int kevent_count = 0, error;
 906         __assert_only kern_return_t kr;
 907
 908         workq_thread_get_addrs(map, stackaddr, &th_addrs);
 909
 910         unsigned int flags = KEVENT_FLAG_STACK_DATA | KEVENT_FLAG_IMMEDIATE |
 911                         KEVENT_FLAG_PARKING;
 912         error = workq_kevent(p, &th_addrs, upcall_flags, events, nevents, flags,
 913                         &kevent_list, &kevent_count);
 914
 915         if (error || kevent_count == 0) {
 916                 return error;
 917         }
 918
 919         kr = pthread_kern->thread_set_voucher_name(MACH_PORT_NULL);
 920         assert(kr == KERN_SUCCESS);
 921
 922         workq_set_register_state(p, th, &th_addrs, kport,
 923                         kevent_list, upcall_flags, kevent_count);
 924
 925         pthread_kern->unix_syscall_return(EJUSTRETURN);
 926         __builtin_unreachable();
 927 }
 928
 929 int
 930 _thread_selfid(__unused struct proc *p, uint64_t *retval)
 931 {
 932         thread_t thread = current_thread();
 933         *retval = thread_tid(thread);
 934         return KERN_SUCCESS;
 935 }
 936
 937 void
 938 _pthread_init(void)
 939 {
 940         pthread_lck_grp_attr = lck_grp_attr_alloc_init();
 941         pthread_lck_grp = lck_grp_alloc_init("pthread", pthread_lck_grp_attr);
 942
 943         /*
 944          * allocate the lock attribute for pthread synchronizers
 945          */
 946         pthread_lck_attr = lck_attr_alloc_init();
 947         pthread_list_mlock = lck_mtx_alloc_init(pthread_lck_grp, pthread_lck_attr);
 948
 949         pth_global_hashinit();
 950         psynch_thcall = thread_call_allocate(psynch_wq_cleanup, NULL);
 951         psynch_zoneinit();
 952
 953         int policy_bootarg;
 954         if (PE_parse_boot_argn("pthread_mutex_default_policy", &policy_bootarg, sizeof(policy_bootarg))) {
 955                 pthread_mutex_default_policy = policy_bootarg;
 956         }
 957
 958         sysctl_register_oid(&sysctl__kern_pthread_mutex_default_policy);
 959 }